diff --git a/Readme.md b/Readme.md
index 53042ef..079fd63 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,7 +1,90 @@
-# Evaluation Pipeline
+Evaluation pipeline for OCR tasks with large multimodal models ([paper](https://arxiv.org/pdf/2305.07895.pdf)).
-[Paper](https://arxiv.org/pdf/2305.07895.pdf). V2 is updating.
+# Results
-Updating the instruction of evaluating large multimodal models on ocr tasks.
+Results are available in the `answer_save` folder.
-Feel free to open issues for any suggestion or comment.
+
+
+Visualization results
+
+
+
+
+# Data Download
+| Data file | Size |
+| --- | ---: |
+| [text recognition](https://pan.baidu.com/s/1Ba950d94u8RQmtqvkLBk-A) (extraction code: iwyn) | 1.37 GB |
+
+The TextVQA, KIE, and HME data files will be added soon.
+
+We assume that your symlinked `data` directory has the following structure:
+
+```
+data
+|_ IC13_857
+|_ IC15_1811
+|_ ...
+|_ ESTVQA
+|_ textVQA
+|_ ...
+|_ FUNSD
+|_ POIE
+```
+
+
+# Usage
+
+Evaluate on all datasets:
+```Shell
+python eval.py --model_name LLaVA --eval_all
+```
+
+Evaluate on a single dataset:
+```Shell
+python eval.py --model_name LLaVA --eval_textVQA
+```
+
+Evaluate on specific OCR recognition datasets:
+```Shell
+python eval.py --model_name LLaVA --eval_ocr --ocr_dataset_name "ct80 IIIT5K"
+```
+The results will be saved in the `answer` folder.
+
+To add a new model, write its inference class under the `models` folder and update the `get_model` function in `eval.py`. An example inference implementation is shown below:
+
+```python
+import torch
+from PIL import Image
+from lavis.models import load_model_and_preprocess
+from ..process import pad_image, resize_image
+
+
+class lavis:
+    def __init__(self, model_name, model_type, device) -> None:
+        # load a LAVIS model together with its image and text processors
+        model, vis_processors, txt_processors = load_model_and_preprocess(
+            name=model_name, model_type=model_type, is_eval=True, device=device
+        )
+        self.model_name = model_name
+        self.model = model
+        self.vis_processors = vis_processors
+        self.txt_processors = txt_processors
+        self.device = device
+
+    def generate(self, image, question, name='resize'):
+        # build the prompt in the format expected by the underlying language model
+        if 'opt' in self.model_name:
+            prompt = f'Question: {question} Answer:'
+        elif 't5' in self.model_name:
+            prompt = f'Question: {question} Short answer:'
+        else:
+            prompt = f'Question: {question} Answer:'
+        # pad or resize the input image to 224x224 before the visual processor
+        image = Image.open(image).convert("RGB")
+        if name == "pad":
+            image = pad_image(image, (224, 224))
+        elif name == "resize":
+            image = resize_image(image, (224, 224))
+        image = self.vis_processors["eval"](image).unsqueeze(0).to(self.device)
+        prompt = self.txt_processors["eval"](prompt)
+        answer = self.model.predict_answers(
+            samples={"image": image, "text_input": prompt},
+            inference_method="generate", max_len=48, min_len=1,
+        )[0]
+        return answer
+```
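+
+The new class must then be returned by `get_model` in `eval.py`. Below is a minimal sketch of that wiring; it assumes `eval.py` dispatches on `--model_name` and that the class above is saved as `models/lavis.py`, so adapt the import path, model names, and constructor arguments to your actual layout:
+
+```python
+# sketch of the dispatch inside eval.py; the names here are illustrative
+def get_model(args, device='cuda:0'):
+    if args.model_name == 'BLIP2':
+        # hypothetical registration of the LAVIS wrapper shown above
+        from models.lavis import lavis
+        return lavis(model_name='blip2_opt', model_type='pretrain_opt2.7b', device=device)
+    # ... existing branches (LLaVA, MiniGPT4, mPLUG-Owl, ...) stay unchanged
+    raise ValueError(f'unknown model: {args.model_name}')
+```
+
+Once registered, the model can be evaluated like any other, e.g. `python eval.py --model_name BLIP2 --eval_textVQA`.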
+
+# Related Projects
+- [LLaVA](https://github.com/haotian-liu/LLaVA.git)
+- [MiniGPT4](https://github.com/Vision-CAIR/MiniGPT-4.git)
+- [mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl.git)
+- [OpenFlamingo](https://github.com/mlfoundations/open_flamingo.git)
+- [Lavis](https://github.com/salesforce/LAVIS.git)
diff --git a/models/MiniGPT4 b/models/MiniGPT4
deleted file mode 160000
index 22d8888..0000000
--- a/models/MiniGPT4
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 22d8888ca2cf0aac862f537e7d22ef5830036808
diff --git a/models/MiniGPT4/LICENSE.md b/models/MiniGPT4/LICENSE.md
new file mode 100644
index 0000000..9d5f2f9
--- /dev/null
+++ b/models/MiniGPT4/LICENSE.md
@@ -0,0 +1,14 @@
+BSD 3-Clause License
+
+Copyright 2023 Deyao Zhu
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/models/MiniGPT4/LICENSE_Lavis.md b/models/MiniGPT4/LICENSE_Lavis.md
new file mode 100644
index 0000000..9ba9791
--- /dev/null
+++ b/models/MiniGPT4/LICENSE_Lavis.md
@@ -0,0 +1,14 @@
+BSD 3-Clause License
+
+Copyright (c) 2022 Salesforce, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+3. Neither the name of Salesforce.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/models/MiniGPT4/MiniGPT4.py b/models/MiniGPT4/MiniGPT4.py
new file mode 100644
index 0000000..b808cb1
--- /dev/null
+++ b/models/MiniGPT4/MiniGPT4.py
@@ -0,0 +1,50 @@
+import argparse
+import os
+import random
+import numpy as np
+import torch.backends.cudnn as cudnn
+from minigpt4.common.config import Config
+from minigpt4.common.dist_utils import get_rank
+from minigpt4.common.registry import registry
+from minigpt4.conversation.conversation import Chat, CONV_VISION
+
+# imports modules for registration
+from minigpt4.datasets.builders import *
+from minigpt4.models import *
+from minigpt4.processors import *
+from minigpt4.runners import *
+from minigpt4.tasks import *
+from ..process import pad_image, resize_image
+from PIL import Image
+import torch
+class MiniGPT4:
+ def __init__(self, args, device='cuda:0') -> None:
+ args.cfg_path = args.MiniGPT4_cfg_path
+ args.options=None
+ cfg = Config(args)
+ model_config = cfg.model_cfg
+ model_config.device_8bit = int(device[-1])
+ model_cls = registry.get_model_class(model_config.arch)
+ model = model_cls.from_config(model_config).to(device)
+
+ vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train
+ vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
+ self.chat = Chat(model, vis_processor, device = device)
+ def generate(self, image, question, name='resize', *args):
+ chat_state = CONV_VISION.copy()
+ num_beams = 1
+ temperature = 0.9
+ img_list = []
+ image = Image.open(image).convert('RGB')
+ if name == 'resize':
+ image = resize_image(image, (224,224))
+ llm_message = self.chat.upload_img(image, chat_state, img_list)
+ self.chat.ask(question, chat_state)
+ llm_message = self.chat.answer(conv=chat_state,
+ img_list=img_list,
+ num_beams=num_beams,
+ temperature=temperature,
+ max_new_tokens=128,
+ max_length=640)[0]
+ return llm_message
+
diff --git a/models/MiniGPT4/MiniGPT_4.pdf b/models/MiniGPT4/MiniGPT_4.pdf
new file mode 100644
index 0000000..5450815
Binary files /dev/null and b/models/MiniGPT4/MiniGPT_4.pdf differ
diff --git a/models/MiniGPT4/PrepareVicuna.md b/models/MiniGPT4/PrepareVicuna.md
new file mode 100644
index 0000000..0585e62
--- /dev/null
+++ b/models/MiniGPT4/PrepareVicuna.md
@@ -0,0 +1,35 @@
+## How to Prepare Vicuna Weight
+Vicuna is an open-source LLaMA-based LLM whose performance is close to ChatGPT's.
+We currently use the v0 version of Vicuna-13B.
+
+To prepare Vicuna’s weight, first download Vicuna’s **delta** weight from [https://huggingface.co/lmsys/vicuna-13b-delta-v0](https://huggingface.co/lmsys/vicuna-13b-delta-v0).
+If you have git-lfs installed (https://git-lfs.com), this can be done by:
+
+```
+git lfs install
+git clone https://huggingface.co/lmsys/vicuna-13b-delta-v0 # more powerful, needs at least 24G of GPU memory
+# or
+git clone https://huggingface.co/lmsys/vicuna-7b-delta-v0 # smaller, needs 12G of GPU memory
+```
+
+Note that these are not the working weights themselves, but the difference between the working weights and the original LLaMA-13B weights. (Due to LLaMA's license, we cannot distribute LLaMA weights directly.)
+
+Then, you need to obtain the original LLaMA-7B or LLaMA-13B weights in the HuggingFace format,
+either by following the instructions provided by HuggingFace
+[here](https://huggingface.co/docs/transformers/main/model_doc/llama) or from the Internet.
+
+When these two sets of weights are ready, we can use tools from Vicuna's team to create the real working weights.
+First, install the version of their library that is compatible with Vicuna v0:
+
+```
+pip install git+https://github.com/lm-sys/FastChat.git@v0.1.10
+```
+
+Then, run the following command to create the final working weights:
+
+```
+python -m fastchat.model.apply_delta --base /path/to/llama-13bOR7b-hf/ --target /path/to/save/working/vicuna/weight/ --delta /path/to/vicuna-13bOR7b-delta-v0/
+```
+
+Now you are good to go!
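+
+As a quick sanity check (a sketch, not part of the official instructions; the path below is the `--target` directory from the command above), the merged working weights should load directly with Hugging Face `transformers`:
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# path to the merged working weights produced by apply_delta above
+path = "/path/to/save/working/vicuna/weight/"
+tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False)
+model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16)
+print(model.config.model_type)  # expected: "llama"
+```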
+
diff --git a/models/MiniGPT4/README.md b/models/MiniGPT4/README.md
new file mode 100644
index 0000000..7aa29f2
--- /dev/null
+++ b/models/MiniGPT4/README.md
@@ -0,0 +1,170 @@
+# MiniGPT-4: Enhancing Vision-language Understanding with Advanced Large Language Models
+[Deyao Zhu](https://tsutikgiau.github.io/)* (On Job Market!), [Jun Chen](https://junchen14.github.io/)* (On Job Market!), [Xiaoqian Shen](https://xiaoqian-shen.github.io), [Xiang Li](https://xiangli.ac.cn), and [Mohamed Elhoseiny](https://www.mohamed-elhoseiny.com/). *Equal Contribution
+
+**King Abdullah University of Science and Technology**
+
+[Colab Demo](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing) &nbsp; [YouTube Video](https://www.youtube.com/watch?v=__tftoxpBAw&feature=youtu.be)
+
+
+## News
+We now provide a pretrained MiniGPT-4 aligned with Vicuna-7B! The demo's GPU memory consumption can now be as low as 12GB.
+
+
+## Online Demo
+
+Chat with MiniGPT-4 about your images at the [online demo](https://minigpt-4.github.io).
+
+
+## Examples
+
+More examples can be found in the [project page](https://minigpt-4.github.io).
+
+
+
+## Introduction
+- MiniGPT-4 aligns a frozen visual encoder from BLIP-2 with the frozen Vicuna LLM using just one projection layer (a minimal sketch of this idea follows the list).
+- We train MiniGPT-4 in two stages. The first, traditional pretraining stage uses roughly 5 million aligned image-text pairs and takes about 10 hours on 4 A100s. After this stage, Vicuna is able to understand the image, but its generation ability is heavily impacted.
+- To address this issue and improve usability, we propose a novel way to create high-quality image-text pairs using the model itself together with ChatGPT. Based on this, we create a small (3,500 pairs in total) yet high-quality dataset.
+- The second finetuning stage trains on this dataset with a conversation template to significantly improve generation reliability and overall usability. To our surprise, this stage is computationally efficient and takes only around 7 minutes on a single A100.
+- MiniGPT-4 yields many emerging vision-language capabilities similar to those demonstrated in GPT-4.
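+
+As a rough illustration of the single-projection-layer idea above (a sketch only; the dimensions and module names are illustrative, not the repository's actual code):
+
+```python
+import torch.nn as nn
+
+class VisionToLLMProjection(nn.Module):
+    """Maps frozen Q-Former output tokens into the LLM's input embedding space."""
+    def __init__(self, qformer_dim=768, llm_dim=5120):  # 5120 is the hidden size of a 13B LLaMA
+        super().__init__()
+        self.proj = nn.Linear(qformer_dim, llm_dim)  # the only trainable component
+
+    def forward(self, qformer_tokens):
+        # qformer_tokens: (batch, num_query_tokens, qformer_dim)
+        return self.proj(qformer_tokens)  # (batch, num_query_tokens, llm_dim)
+```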
+
+
+
+
+
+## Getting Started
+### Installation
+
+**1. Prepare the code and the environment**
+
+Git clone our repository, create a Python environment, and activate it via the following commands:
+
+```bash
+git clone https://github.com/Vision-CAIR/MiniGPT-4.git
+cd MiniGPT-4
+conda env create -f environment.yml
+conda activate minigpt4
+```
+
+
+**2. Prepare the pretrained Vicuna weights**
+
+The current version of MiniGPT-4 is built on the v0 version of Vicuna-13B.
+Please refer to our instructions [here](PrepareVicuna.md)
+to prepare the Vicuna weights.
+The final weights should be in a single folder with a structure similar to the following:
+
+```
+vicuna_weights
+├── config.json
+├── generation_config.json
+├── pytorch_model.bin.index.json
+├── pytorch_model-00001-of-00003.bin
+...
+```
+
+Then, set the path to the vicuna weight in the model config file
+[here](minigpt4/configs/models/minigpt4.yaml#L16) at Line 16.
+
+**3. Prepare the pretrained MiniGPT-4 checkpoint**
+
+Download the pretrained checkpoint that matches the Vicuna model you prepared.
+
+| Checkpoint Aligned with Vicuna 13B | Checkpoint Aligned with Vicuna 7B |
+:------------------------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------:
+ [Download](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link) | [Download](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing)
+
+
+Then, set the path to the pretrained checkpoint in the evaluation config file
+in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml#L11) at Line 11.
+
+
+
+### Launching Demo Locally
+
+Try out our demo [demo.py](demo.py) on your local machine by running
+
+```
+python demo.py --cfg-path eval_configs/minigpt4_eval.yaml --gpu-id 0
+```
+
+To save GPU memory, Vicuna is loaded in 8-bit by default, with a beam search width of 1.
+This configuration requires about 23G of GPU memory for Vicuna 13B and 11.5G for Vicuna 7B.
+For more powerful GPUs, you can run the model
+in 16-bit by setting `low_resource` to `False` in the config file
+[minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml) and using a larger beam search width.
+
+Thanks to [@WangRongsheng](https://github.com/WangRongsheng), you can also run our code on [Colab](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing).
+
+
+### Training
+The training of MiniGPT-4 contains two alignment stages.
+
+**1. First pretraining stage**
+
+In the first pretraining stage, the model is trained on image-text pairs from the Laion and CC datasets
+to align the vision and language models. To download and prepare the datasets, please check
+our [first stage dataset preparation instructions](dataset/README_1_STAGE.md).
+After the first stage, the visual features are mapped into a space the language model can understand.
+To launch the first stage training, run the following command. In our experiments, we use 4 A100s.
+You can change the save path in the config file
+[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage1_pretrain.yaml).
+
+```bash
+torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage1_pretrain.yaml
+```
+
+A MiniGPT-4 checkpoint with only stage one training can be downloaded
+[here (13B)](https://drive.google.com/file/d/1u9FRRBB3VovP1HxCAlpD9Lw4t4P6-Yq8/view?usp=share_link) or [here (7B)](https://drive.google.com/file/d/1HihQtCEXUyBM1i9DQbaK934wW3TZi-h5/view?usp=share_link).
+Compared to the model after stage two, this checkpoint frequently generates incomplete and repeated sentences.
+
+
+**2. Second finetuning stage**
+
+In the second stage, we use a small, high-quality image-text pair dataset created by ourselves
+and convert it to a conversation format to further align MiniGPT-4.
+To download and prepare our second stage dataset, please check our
+[second stage dataset preparation instructions](dataset/README_2_STAGE.md).
+To launch the second stage alignment,
+first specify the path to the checkpoint file trained in stage 1 in
+[train_configs/minigpt4_stage2_finetune.yaml](train_configs/minigpt4_stage2_finetune.yaml).
+You can also specify the output path there.
+Then, run the following command. In our experiments, we use 1 A100.
+
+```bash
+torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage2_finetune.yaml
+```
+
+After the second stage alignment, MiniGPT-4 is able to talk about images coherently and in a user-friendly way.
+
+
+
+
+## Acknowledgement
+
++ [BLIP2](https://huggingface.co/docs/transformers/main/model_doc/blip-2) The model architecture of MiniGPT-4 follows BLIP-2. Don't forget to check out this great open-source work if you don't know it already!
++ [Lavis](https://github.com/salesforce/LAVIS) This repository is built upon Lavis!
++ [Vicuna](https://github.com/lm-sys/FastChat) The fantastic language ability of Vicuna with only 13B parameters is just amazing. And it is open-source!
+
+
+If you're using MiniGPT-4 in your research or applications, please cite using this BibTeX:
+```bibtex
+@article{zhu2023minigpt,
+ title={MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models},
+ author={Zhu, Deyao and Chen, Jun and Shen, Xiaoqian and Li, Xiang and Elhoseiny, Mohamed},
+ journal={arXiv preprint arXiv:2304.10592},
+ year={2023}
+}
+```
+
+
+## License
+This repository is under [BSD 3-Clause License](LICENSE.md).
+Many codes are based on [Lavis](https://github.com/salesforce/LAVIS) with
+BSD 3-Clause License [here](LICENSE_Lavis.md).
diff --git a/models/MiniGPT4/__pycache__/MiniGPT4.cpython-310.pyc b/models/MiniGPT4/__pycache__/MiniGPT4.cpython-310.pyc
new file mode 100644
index 0000000..eb4a707
Binary files /dev/null and b/models/MiniGPT4/__pycache__/MiniGPT4.cpython-310.pyc differ
diff --git a/models/MiniGPT4/dataset/README_1_STAGE.md b/models/MiniGPT4/dataset/README_1_STAGE.md
new file mode 100644
index 0000000..47ffaae
--- /dev/null
+++ b/models/MiniGPT4/dataset/README_1_STAGE.md
@@ -0,0 +1,96 @@
+## Download the filtered Conceptual Captions, SBU, LAION datasets
+
+### Pre-training datasets download:
+We use the filtered synthetic captions prepared by BLIP. For more details about the dataset, please refer to [BLIP](https://github.com/salesforce/BLIP).
+
+It requires about 2.3 TB of storage for the LAION and CC3M+CC12M+SBU datasets.
+
+Image source | Filtered synthetic caption by ViT-L
+--- | :---:
+CC3M+CC12M+SBU | Download
+LAION115M | Download
+
+This will download two JSON files:
+```
+ccs_synthetic_filtered_large.json
+laion_synthetic_filtered_large.json
+```
+
+## Prepare the data step by step
+
+
+### Set up the dataset folder and move the annotation files to the data storage folder
+```
+export MINIGPT4_DATASET=/YOUR/PATH/FOR/LARGE/DATASET/
+mkdir ${MINIGPT4_DATASET}/cc_sbu
+mkdir ${MINIGPT4_DATASET}/laion
+mv ccs_synthetic_filtered_large.json ${MINIGPT4_DATASET}/cc_sbu
+mv laion_synthetic_filtered_large.json ${MINIGPT4_DATASET}/laion
+```
+
+### Copy the conversion scripts to the data storage folder
+```
+cp convert_cc_sbu.py ${MINIGPT4_DATASET}/cc_sbu
+cp download_cc_sbu.sh ${MINIGPT4_DATASET}/cc_sbu
+cp convert_laion.py ${MINIGPT4_DATASET}/laion
+cp download_laion.sh ${MINIGPT4_DATASET}/laion
+```
+
+
+### Convert the laion and cc_sbu annotation files to the img2dataset format
+```
+cd ${MINIGPT4_DATASET}/cc_sbu
+python convert_cc_sbu.py
+
+cd ${MINIGPT4_DATASET}/laion
+python convert_laion.py
+```
+
+### Download the datasets with img2dataset
+```
+cd ${MINIGPT4_DATASET}/cc_sbu
+sh download_cc_sbu.sh
+cd ${MINIGPT4_DATASET}/laion
+sh download_laion.sh
+```
+
+
+The final dataset structure
+
+```
+.
+├── ${MINIGPT4_DATASET}
+│ ├── cc_sbu
+│ ├── convert_cc_sbu.py
+│ ├── download_cc_sbu.sh
+│ ├── ccs_synthetic_filtered_large.json
+│ ├── ccs_synthetic_filtered_large.tsv
+│ └── cc_sbu_dataset
+│ ├── 00000.tar
+│ ├── 00000.parquet
+│ ...
+│ ├── laion
+│ ├── convert_laion.py
+│ ├── download_laion.sh
+│ ├── laion_synthetic_filtered_large.json
+│ ├── laion_synthetic_filtered_large.tsv
+│ └── laion_dataset
+│ ├── 00000.tar
+│ ├── 00000.parquet
+│ ...
+...
+```
+
+
+## Set up the dataset configuration files
+
+Then, set the LAION dataset loading path
+[here](../minigpt4/configs/datasets/laion/defaults.yaml#L5) at Line 5 to
+`${MINIGPT4_DATASET}/laion/laion_dataset/{00000..10488}.tar`
+
+and the Conceptual Captions and SBU datasets loading path
+[here](../minigpt4/configs/datasets/cc_sbu/defaults.yaml#L5) at Line 5 to
+`${MINIGPT4_DATASET}/cc_sbu/cc_sbu_dataset/{00000..01255}.tar`
+
+
+
diff --git a/models/MiniGPT4/dataset/README_2_STAGE.md b/models/MiniGPT4/dataset/README_2_STAGE.md
new file mode 100644
index 0000000..b826765
--- /dev/null
+++ b/models/MiniGPT4/dataset/README_2_STAGE.md
@@ -0,0 +1,19 @@
+## Second Stage Data Preparation
+
+Our second stage dataset can be downloaded from
+[here](https://drive.google.com/file/d/1nJXhoEcy3KTExr17I7BXqY5Y9Lx_-n-9/view?usp=share_link).
+After extraction, you will get a data folder with the following structure:
+
+```
+cc_sbu_align
+├── filter_cap.json
+└── image
+ ├── 2.jpg
+ ├── 3.jpg
+ ...
+```
+
+Put the folder at any path you want.
+Then, set up the dataset path in the dataset config file
+[here](../minigpt4/configs/datasets/cc_sbu/align.yaml#L5) at Line 5.
+
diff --git a/models/MiniGPT4/dataset/convert_cc_sbu.py b/models/MiniGPT4/dataset/convert_cc_sbu.py
new file mode 100644
index 0000000..8c325ed
--- /dev/null
+++ b/models/MiniGPT4/dataset/convert_cc_sbu.py
@@ -0,0 +1,20 @@
+import json
+import csv
+
+# specify input and output file paths
+input_file = 'ccs_synthetic_filtered_large.json'
+output_file = 'ccs_synthetic_filtered_large.tsv'
+
+# load JSON data from input file
+with open(input_file, 'r') as f:
+ data = json.load(f)
+
+# extract header and data from JSON
+header = data[0].keys()
+rows = [x.values() for x in data]
+
+# write data to TSV file
+with open(output_file, 'w') as f:
+ writer = csv.writer(f, delimiter='\t')
+ writer.writerow(header)
+ writer.writerows(rows)
diff --git a/models/MiniGPT4/dataset/convert_laion.py b/models/MiniGPT4/dataset/convert_laion.py
new file mode 100644
index 0000000..b793579
--- /dev/null
+++ b/models/MiniGPT4/dataset/convert_laion.py
@@ -0,0 +1,20 @@
+import json
+import csv
+
+# specify input and output file paths
+input_file = 'laion_synthetic_filtered_large.json'
+output_file = 'laion_synthetic_filtered_large.tsv'
+
+# load JSON data from input file
+with open(input_file, 'r') as f:
+ data = json.load(f)
+
+# extract header and data from JSON
+header = data[0].keys()
+rows = [x.values() for x in data]
+
+# write data to TSV file
+with open(output_file, 'w') as f:
+ writer = csv.writer(f, delimiter='\t')
+ writer.writerow(header)
+ writer.writerows(rows)
diff --git a/models/MiniGPT4/dataset/download_cc_sbu.sh b/models/MiniGPT4/dataset/download_cc_sbu.sh
new file mode 100644
index 0000000..ec102bd
--- /dev/null
+++ b/models/MiniGPT4/dataset/download_cc_sbu.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+img2dataset --url_list ccs_synthetic_filtered_large.tsv --input_format "tsv"\
+ --url_col "url" --caption_col "caption" --output_format webdataset\
+ --output_folder cc_sbu_dataset --processes_count 16 --thread_count 128 --image_size 256 \
+ --enable_wandb True
diff --git a/models/MiniGPT4/dataset/download_laion.sh b/models/MiniGPT4/dataset/download_laion.sh
new file mode 100644
index 0000000..5de38e2
--- /dev/null
+++ b/models/MiniGPT4/dataset/download_laion.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+img2dataset --url_list laion_synthetic_filtered_large.tsv --input_format "tsv"\
+ --url_col "url" --caption_col "caption" --output_format webdataset\
+ --output_folder laion_dataset --processes_count 16 --thread_count 128 --image_size 256 \
+ --enable_wandb True
diff --git a/models/MiniGPT4/demo.py b/models/MiniGPT4/demo.py
new file mode 100644
index 0000000..6e25640
--- /dev/null
+++ b/models/MiniGPT4/demo.py
@@ -0,0 +1,154 @@
+import argparse
+import os
+import random
+
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+import gradio as gr
+
+from minigpt4.common.config import Config
+from minigpt4.common.dist_utils import get_rank
+from minigpt4.common.registry import registry
+from minigpt4.conversation.conversation import Chat, CONV_VISION
+
+# imports modules for registration
+from minigpt4.datasets.builders import *
+from minigpt4.models import *
+from minigpt4.processors import *
+from minigpt4.runners import *
+from minigpt4.tasks import *
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description="Demo")
+ parser.add_argument("--cfg-path", required=True, help="path to configuration file.")
+ parser.add_argument("--gpu-id", type=int, default=0, help="specify the gpu to load the model.")
+ parser.add_argument(
+ "--options",
+ nargs="+",
+ help="override some settings in the used config, the key-value pair "
+ "in xxx=yyy format will be merged into config file (deprecate), "
+ "change to --cfg-options instead.",
+ )
+ args = parser.parse_args()
+ return args
+
+
+def setup_seeds(config):
+ seed = config.run_cfg.seed + get_rank()
+
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+
+ cudnn.benchmark = False
+ cudnn.deterministic = True
+
+
+# ========================================
+# Model Initialization
+# ========================================
+
+print('Initializing Chat')
+args = parse_args()
+cfg = Config(args)
+model_config = cfg.model_cfg
+model_config.device_8bit = args.gpu_id
+model_cls = registry.get_model_class(model_config.arch)
+model = model_cls.from_config(model_config).to('cuda:{}'.format(args.gpu_id))
+
+vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train
+vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
+chat = Chat(model, vis_processor, device='cuda:{}'.format(args.gpu_id))
+print('Initialization Finished')
+
+# ========================================
+# Gradio Setting
+# ========================================
+
+def gradio_reset(chat_state, img_list):
+ if chat_state is not None:
+ chat_state.messages = []
+ if img_list is not None:
+ img_list = []
+ return None, gr.update(value=None, interactive=True), gr.update(placeholder='Please upload your image first', interactive=False),gr.update(value="Upload & Start Chat", interactive=True), chat_state, img_list
+
+def upload_img(gr_img, text_input, chat_state):
+ if gr_img is None:
+ return None, None, gr.update(interactive=True), chat_state, None
+ chat_state = CONV_VISION.copy()
+ img_list = []
+ llm_message = chat.upload_img(gr_img, chat_state, img_list)
+ return gr.update(interactive=False), gr.update(interactive=True, placeholder='Type and press Enter'), gr.update(value="Start Chatting", interactive=False), chat_state, img_list
+
+def gradio_ask(user_message, chatbot, chat_state):
+ if len(user_message) == 0:
+ return gr.update(interactive=True, placeholder='Input should not be empty!'), chatbot, chat_state
+ chat.ask(user_message, chat_state)
+ chatbot = chatbot + [[user_message, None]]
+ return '', chatbot, chat_state
+
+
+def gradio_answer(chatbot, chat_state, img_list, num_beams, temperature):
+ llm_message = chat.answer(conv=chat_state,
+ img_list=img_list,
+ num_beams=num_beams,
+ temperature=temperature,
+ max_new_tokens=300,
+ max_length=2000)[0]
+ chatbot[-1][1] = llm_message
+ return chatbot, chat_state, img_list
+
+title = """
Demo of MiniGPT-4
"""
+description = """
This is the demo of MiniGPT-4. Upload your images and start chatting!
"""
+article = """
+"""
+
+#TODO show examples below
+
+with gr.Blocks() as demo:
+ gr.Markdown(title)
+ gr.Markdown(description)
+ gr.Markdown(article)
+
+ with gr.Row():
+ with gr.Column(scale=0.5):
+ image = gr.Image(type="pil")
+ upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
+ clear = gr.Button("Restart")
+
+ num_beams = gr.Slider(
+ minimum=1,
+ maximum=10,
+ value=1,
+ step=1,
+ interactive=True,
+ label="beam search numbers)",
+ )
+
+ temperature = gr.Slider(
+ minimum=0.1,
+ maximum=2.0,
+ value=1.0,
+ step=0.1,
+ interactive=True,
+ label="Temperature",
+ )
+
+ with gr.Column():
+ chat_state = gr.State()
+ img_list = gr.State()
+ chatbot = gr.Chatbot(label='MiniGPT-4')
+ text_input = gr.Textbox(label='User', placeholder='Please upload your image first', interactive=False)
+
+ upload_button.click(upload_img, [image, text_input, chat_state], [image, text_input, upload_button, chat_state, img_list])
+
+ text_input.submit(gradio_ask, [text_input, chatbot, chat_state], [text_input, chatbot, chat_state]).then(
+ gradio_answer, [chatbot, chat_state, img_list, num_beams, temperature], [chatbot, chat_state, img_list]
+ )
+ clear.click(gradio_reset, [chat_state, img_list], [chatbot, image, text_input, upload_button, chat_state, img_list], queue=False)
+
+demo.launch(share=True, enable_queue=True)
diff --git a/models/MiniGPT4/environment.yml b/models/MiniGPT4/environment.yml
new file mode 100644
index 0000000..d5cfcf8
--- /dev/null
+++ b/models/MiniGPT4/environment.yml
@@ -0,0 +1,63 @@
+name: minigpt4
+channels:
+ - pytorch
+ - defaults
+ - anaconda
+dependencies:
+ - python=3.9
+ - cudatoolkit
+ - pip
+ - pytorch=1.12.1
+ - pytorch-mutex=1.0=cuda
+ - torchaudio=0.12.1
+ - torchvision=0.13.1
+ - pip:
+ - accelerate==0.16.0
+ - aiohttp==3.8.4
+ - aiosignal==1.3.1
+ - async-timeout==4.0.2
+ - attrs==22.2.0
+ - bitsandbytes==0.37.0
+ - cchardet==2.1.7
+ - chardet==5.1.0
+ - contourpy==1.0.7
+ - cycler==0.11.0
+ - filelock==3.9.0
+ - fonttools==4.38.0
+ - frozenlist==1.3.3
+ - huggingface-hub==0.13.4
+ - importlib-resources==5.12.0
+ - kiwisolver==1.4.4
+ - matplotlib==3.7.0
+ - multidict==6.0.4
+ - openai==0.27.0
+ - packaging==23.0
+ - psutil==5.9.4
+ - pycocotools==2.0.6
+ - pyparsing==3.0.9
+ - python-dateutil==2.8.2
+ - pyyaml==6.0
+ - regex==2022.10.31
+ - tokenizers==0.13.2
+ - tqdm==4.64.1
+ - transformers==4.28.0
+ - timm==0.6.13
+ - spacy==3.5.1
+ - webdataset==0.2.48
+ - scikit-learn==1.2.2
+ - scipy==1.10.1
+ - yarl==1.8.2
+ - zipp==3.14.0
+ - omegaconf==2.3.0
+ - opencv-python==4.7.0.72
+ - iopath==0.1.10
+ - decord==0.6.0
+ - tenacity==8.2.2
+ - peft
+ - pycocoevalcap
+ - sentence-transformers
+ - umap-learn
+ - notebook
+ - gradio==3.24.1
+ - gradio-client==0.0.8
+ - wandb
diff --git a/models/MiniGPT4/eval_configs/minigpt4_eval.yaml b/models/MiniGPT4/eval_configs/minigpt4_eval.yaml
new file mode 100644
index 0000000..ff89879
--- /dev/null
+++ b/models/MiniGPT4/eval_configs/minigpt4_eval.yaml
@@ -0,0 +1,25 @@
+model:
+ arch: mini_gpt4
+ model_type: pretrain_vicuna
+ freeze_vit: True
+ freeze_qformer: True
+ max_txt_len: 160
+ end_sym: "###"
+ low_resource: True
+ prompt_path: "./models/MiniGPT4/prompts/alignment.txt"
+ prompt_template: '###Human: {} ###Assistant: '
+ ckpt: './models/MiniGPT4/model_weight/projection/pretrained_minigpt4.pth'
+
+
+datasets:
+ cc_sbu_align:
+ vis_processor:
+ train:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+
+run:
+ task: image_text_pretrain
diff --git a/models/MiniGPT4/examples/ad_1.png b/models/MiniGPT4/examples/ad_1.png
new file mode 100644
index 0000000..d0378e4
Binary files /dev/null and b/models/MiniGPT4/examples/ad_1.png differ
diff --git a/models/MiniGPT4/examples/ad_2.png b/models/MiniGPT4/examples/ad_2.png
new file mode 100644
index 0000000..674248b
Binary files /dev/null and b/models/MiniGPT4/examples/ad_2.png differ
diff --git a/models/MiniGPT4/examples/cook_1.png b/models/MiniGPT4/examples/cook_1.png
new file mode 100644
index 0000000..d8cdb45
Binary files /dev/null and b/models/MiniGPT4/examples/cook_1.png differ
diff --git a/models/MiniGPT4/examples/cook_2.png b/models/MiniGPT4/examples/cook_2.png
new file mode 100644
index 0000000..d08272b
Binary files /dev/null and b/models/MiniGPT4/examples/cook_2.png differ
diff --git a/models/MiniGPT4/examples/describe_1.png b/models/MiniGPT4/examples/describe_1.png
new file mode 100644
index 0000000..02f3c92
Binary files /dev/null and b/models/MiniGPT4/examples/describe_1.png differ
diff --git a/models/MiniGPT4/examples/describe_2.png b/models/MiniGPT4/examples/describe_2.png
new file mode 100644
index 0000000..20bf8c7
Binary files /dev/null and b/models/MiniGPT4/examples/describe_2.png differ
diff --git a/models/MiniGPT4/examples/fact_1.png b/models/MiniGPT4/examples/fact_1.png
new file mode 100644
index 0000000..1f75228
Binary files /dev/null and b/models/MiniGPT4/examples/fact_1.png differ
diff --git a/models/MiniGPT4/examples/fact_2.png b/models/MiniGPT4/examples/fact_2.png
new file mode 100644
index 0000000..de6ef53
Binary files /dev/null and b/models/MiniGPT4/examples/fact_2.png differ
diff --git a/models/MiniGPT4/examples/fix_1.png b/models/MiniGPT4/examples/fix_1.png
new file mode 100644
index 0000000..023cfe6
Binary files /dev/null and b/models/MiniGPT4/examples/fix_1.png differ
diff --git a/models/MiniGPT4/examples/fix_2.png b/models/MiniGPT4/examples/fix_2.png
new file mode 100644
index 0000000..f60da5f
Binary files /dev/null and b/models/MiniGPT4/examples/fix_2.png differ
diff --git a/models/MiniGPT4/examples/fun_1.png b/models/MiniGPT4/examples/fun_1.png
new file mode 100644
index 0000000..f720ea6
Binary files /dev/null and b/models/MiniGPT4/examples/fun_1.png differ
diff --git a/models/MiniGPT4/examples/fun_2.png b/models/MiniGPT4/examples/fun_2.png
new file mode 100644
index 0000000..1d37a80
Binary files /dev/null and b/models/MiniGPT4/examples/fun_2.png differ
diff --git a/models/MiniGPT4/examples/logo_1.png b/models/MiniGPT4/examples/logo_1.png
new file mode 100644
index 0000000..8bbe438
Binary files /dev/null and b/models/MiniGPT4/examples/logo_1.png differ
diff --git a/models/MiniGPT4/examples/op_1.png b/models/MiniGPT4/examples/op_1.png
new file mode 100644
index 0000000..3dbb2ff
Binary files /dev/null and b/models/MiniGPT4/examples/op_1.png differ
diff --git a/models/MiniGPT4/examples/op_2.png b/models/MiniGPT4/examples/op_2.png
new file mode 100644
index 0000000..2cd3e1f
Binary files /dev/null and b/models/MiniGPT4/examples/op_2.png differ
diff --git a/models/MiniGPT4/examples/people_1.png b/models/MiniGPT4/examples/people_1.png
new file mode 100644
index 0000000..7e95c42
Binary files /dev/null and b/models/MiniGPT4/examples/people_1.png differ
diff --git a/models/MiniGPT4/examples/people_2.png b/models/MiniGPT4/examples/people_2.png
new file mode 100644
index 0000000..aec6c83
Binary files /dev/null and b/models/MiniGPT4/examples/people_2.png differ
diff --git a/models/MiniGPT4/examples/rhyme_1.png b/models/MiniGPT4/examples/rhyme_1.png
new file mode 100644
index 0000000..7d13387
Binary files /dev/null and b/models/MiniGPT4/examples/rhyme_1.png differ
diff --git a/models/MiniGPT4/examples/rhyme_2.png b/models/MiniGPT4/examples/rhyme_2.png
new file mode 100644
index 0000000..6cf9bf8
Binary files /dev/null and b/models/MiniGPT4/examples/rhyme_2.png differ
diff --git a/models/MiniGPT4/examples/story_1.png b/models/MiniGPT4/examples/story_1.png
new file mode 100644
index 0000000..3eb6ccb
Binary files /dev/null and b/models/MiniGPT4/examples/story_1.png differ
diff --git a/models/MiniGPT4/examples/story_2.png b/models/MiniGPT4/examples/story_2.png
new file mode 100644
index 0000000..9d37142
Binary files /dev/null and b/models/MiniGPT4/examples/story_2.png differ
diff --git a/models/MiniGPT4/examples/web_1.png b/models/MiniGPT4/examples/web_1.png
new file mode 100644
index 0000000..8943842
Binary files /dev/null and b/models/MiniGPT4/examples/web_1.png differ
diff --git a/models/MiniGPT4/examples/wop_1.png b/models/MiniGPT4/examples/wop_1.png
new file mode 100644
index 0000000..88f37d6
Binary files /dev/null and b/models/MiniGPT4/examples/wop_1.png differ
diff --git a/models/MiniGPT4/examples/wop_2.png b/models/MiniGPT4/examples/wop_2.png
new file mode 100644
index 0000000..8255974
Binary files /dev/null and b/models/MiniGPT4/examples/wop_2.png differ
diff --git a/models/MiniGPT4/figs/examples/ad_1.png b/models/MiniGPT4/figs/examples/ad_1.png
new file mode 100644
index 0000000..d0378e4
Binary files /dev/null and b/models/MiniGPT4/figs/examples/ad_1.png differ
diff --git a/models/MiniGPT4/figs/examples/ad_2.png b/models/MiniGPT4/figs/examples/ad_2.png
new file mode 100644
index 0000000..674248b
Binary files /dev/null and b/models/MiniGPT4/figs/examples/ad_2.png differ
diff --git a/models/MiniGPT4/figs/examples/cook_1.png b/models/MiniGPT4/figs/examples/cook_1.png
new file mode 100644
index 0000000..d8cdb45
Binary files /dev/null and b/models/MiniGPT4/figs/examples/cook_1.png differ
diff --git a/models/MiniGPT4/figs/examples/cook_2.png b/models/MiniGPT4/figs/examples/cook_2.png
new file mode 100644
index 0000000..d08272b
Binary files /dev/null and b/models/MiniGPT4/figs/examples/cook_2.png differ
diff --git a/models/MiniGPT4/figs/examples/describe_1.png b/models/MiniGPT4/figs/examples/describe_1.png
new file mode 100644
index 0000000..02f3c92
Binary files /dev/null and b/models/MiniGPT4/figs/examples/describe_1.png differ
diff --git a/models/MiniGPT4/figs/examples/describe_2.png b/models/MiniGPT4/figs/examples/describe_2.png
new file mode 100644
index 0000000..20bf8c7
Binary files /dev/null and b/models/MiniGPT4/figs/examples/describe_2.png differ
diff --git a/models/MiniGPT4/figs/examples/fact_1.png b/models/MiniGPT4/figs/examples/fact_1.png
new file mode 100644
index 0000000..1f75228
Binary files /dev/null and b/models/MiniGPT4/figs/examples/fact_1.png differ
diff --git a/models/MiniGPT4/figs/examples/fact_2.png b/models/MiniGPT4/figs/examples/fact_2.png
new file mode 100644
index 0000000..de6ef53
Binary files /dev/null and b/models/MiniGPT4/figs/examples/fact_2.png differ
diff --git a/models/MiniGPT4/figs/examples/fix_1.png b/models/MiniGPT4/figs/examples/fix_1.png
new file mode 100644
index 0000000..023cfe6
Binary files /dev/null and b/models/MiniGPT4/figs/examples/fix_1.png differ
diff --git a/models/MiniGPT4/figs/examples/fix_2.png b/models/MiniGPT4/figs/examples/fix_2.png
new file mode 100644
index 0000000..f60da5f
Binary files /dev/null and b/models/MiniGPT4/figs/examples/fix_2.png differ
diff --git a/models/MiniGPT4/figs/examples/fun_1.png b/models/MiniGPT4/figs/examples/fun_1.png
new file mode 100644
index 0000000..f720ea6
Binary files /dev/null and b/models/MiniGPT4/figs/examples/fun_1.png differ
diff --git a/models/MiniGPT4/figs/examples/fun_2.png b/models/MiniGPT4/figs/examples/fun_2.png
new file mode 100644
index 0000000..1d37a80
Binary files /dev/null and b/models/MiniGPT4/figs/examples/fun_2.png differ
diff --git a/models/MiniGPT4/figs/examples/logo_1.png b/models/MiniGPT4/figs/examples/logo_1.png
new file mode 100644
index 0000000..8bbe438
Binary files /dev/null and b/models/MiniGPT4/figs/examples/logo_1.png differ
diff --git a/models/MiniGPT4/figs/examples/op_1.png b/models/MiniGPT4/figs/examples/op_1.png
new file mode 100644
index 0000000..3dbb2ff
Binary files /dev/null and b/models/MiniGPT4/figs/examples/op_1.png differ
diff --git a/models/MiniGPT4/figs/examples/op_2.png b/models/MiniGPT4/figs/examples/op_2.png
new file mode 100644
index 0000000..2cd3e1f
Binary files /dev/null and b/models/MiniGPT4/figs/examples/op_2.png differ
diff --git a/models/MiniGPT4/figs/examples/people_1.png b/models/MiniGPT4/figs/examples/people_1.png
new file mode 100644
index 0000000..7e95c42
Binary files /dev/null and b/models/MiniGPT4/figs/examples/people_1.png differ
diff --git a/models/MiniGPT4/figs/examples/people_2.png b/models/MiniGPT4/figs/examples/people_2.png
new file mode 100644
index 0000000..aec6c83
Binary files /dev/null and b/models/MiniGPT4/figs/examples/people_2.png differ
diff --git a/models/MiniGPT4/figs/examples/rhyme_1.png b/models/MiniGPT4/figs/examples/rhyme_1.png
new file mode 100644
index 0000000..7d13387
Binary files /dev/null and b/models/MiniGPT4/figs/examples/rhyme_1.png differ
diff --git a/models/MiniGPT4/figs/examples/rhyme_2.png b/models/MiniGPT4/figs/examples/rhyme_2.png
new file mode 100644
index 0000000..6cf9bf8
Binary files /dev/null and b/models/MiniGPT4/figs/examples/rhyme_2.png differ
diff --git a/models/MiniGPT4/figs/examples/story_1.png b/models/MiniGPT4/figs/examples/story_1.png
new file mode 100644
index 0000000..3eb6ccb
Binary files /dev/null and b/models/MiniGPT4/figs/examples/story_1.png differ
diff --git a/models/MiniGPT4/figs/examples/story_2.png b/models/MiniGPT4/figs/examples/story_2.png
new file mode 100644
index 0000000..9d37142
Binary files /dev/null and b/models/MiniGPT4/figs/examples/story_2.png differ
diff --git a/models/MiniGPT4/figs/examples/web_1.png b/models/MiniGPT4/figs/examples/web_1.png
new file mode 100644
index 0000000..8943842
Binary files /dev/null and b/models/MiniGPT4/figs/examples/web_1.png differ
diff --git a/models/MiniGPT4/figs/examples/wop_1.png b/models/MiniGPT4/figs/examples/wop_1.png
new file mode 100644
index 0000000..88f37d6
Binary files /dev/null and b/models/MiniGPT4/figs/examples/wop_1.png differ
diff --git a/models/MiniGPT4/figs/examples/wop_2.png b/models/MiniGPT4/figs/examples/wop_2.png
new file mode 100644
index 0000000..8255974
Binary files /dev/null and b/models/MiniGPT4/figs/examples/wop_2.png differ
diff --git a/models/MiniGPT4/figs/online_demo.png b/models/MiniGPT4/figs/online_demo.png
new file mode 100644
index 0000000..716e438
Binary files /dev/null and b/models/MiniGPT4/figs/online_demo.png differ
diff --git a/models/MiniGPT4/figs/overview.png b/models/MiniGPT4/figs/overview.png
new file mode 100644
index 0000000..10b952e
Binary files /dev/null and b/models/MiniGPT4/figs/overview.png differ
diff --git a/models/MiniGPT4/minigpt4/__init__.py b/models/MiniGPT4/minigpt4/__init__.py
new file mode 100644
index 0000000..456b5a4
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/__init__.py
@@ -0,0 +1,30 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import os
+import sys
+
+from omegaconf import OmegaConf
+from minigpt4.common.registry import registry
+
+from minigpt4.datasets.builders import *
+from minigpt4.models import *
+from minigpt4.processors import *
+from minigpt4.tasks import *
+
+
+root_dir = os.path.dirname(os.path.abspath(__file__))
+default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml"))
+
+registry.register_path("library_root", root_dir)
+repo_root = os.path.join(root_dir, "..")
+registry.register_path("repo_root", repo_root)
+cache_root = os.path.join(repo_root, default_cfg.env.cache_root)
+registry.register_path("cache_root", cache_root)
+
+registry.register("MAX_INT", sys.maxsize)
+registry.register("SPLIT_NAMES", ["train", "val", "test"])
diff --git a/models/MiniGPT4/minigpt4/__pycache__/__init__.cpython-310.pyc b/models/MiniGPT4/minigpt4/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..249018a
Binary files /dev/null and b/models/MiniGPT4/minigpt4/__pycache__/__init__.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/common/__init__.py b/models/MiniGPT4/minigpt4/common/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/MiniGPT4/minigpt4/common/__pycache__/__init__.cpython-310.pyc b/models/MiniGPT4/minigpt4/common/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..7c3cea4
Binary files /dev/null and b/models/MiniGPT4/minigpt4/common/__pycache__/__init__.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/common/__pycache__/config.cpython-310.pyc b/models/MiniGPT4/minigpt4/common/__pycache__/config.cpython-310.pyc
new file mode 100644
index 0000000..2b5ebec
Binary files /dev/null and b/models/MiniGPT4/minigpt4/common/__pycache__/config.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/common/__pycache__/dist_utils.cpython-310.pyc b/models/MiniGPT4/minigpt4/common/__pycache__/dist_utils.cpython-310.pyc
new file mode 100644
index 0000000..11ae2a6
Binary files /dev/null and b/models/MiniGPT4/minigpt4/common/__pycache__/dist_utils.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/common/__pycache__/logger.cpython-310.pyc b/models/MiniGPT4/minigpt4/common/__pycache__/logger.cpython-310.pyc
new file mode 100644
index 0000000..37e1ec4
Binary files /dev/null and b/models/MiniGPT4/minigpt4/common/__pycache__/logger.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/common/__pycache__/registry.cpython-310.pyc b/models/MiniGPT4/minigpt4/common/__pycache__/registry.cpython-310.pyc
new file mode 100644
index 0000000..52f5590
Binary files /dev/null and b/models/MiniGPT4/minigpt4/common/__pycache__/registry.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/common/__pycache__/utils.cpython-310.pyc b/models/MiniGPT4/minigpt4/common/__pycache__/utils.cpython-310.pyc
new file mode 100644
index 0000000..a611dd6
Binary files /dev/null and b/models/MiniGPT4/minigpt4/common/__pycache__/utils.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/common/config.py b/models/MiniGPT4/minigpt4/common/config.py
new file mode 100644
index 0000000..e184b1f
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/common/config.py
@@ -0,0 +1,468 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import logging
+import json
+from typing import Dict
+
+from omegaconf import OmegaConf
+from minigpt4.common.registry import registry
+
+
+class Config:
+ def __init__(self, args):
+ self.config = {}
+
+ self.args = args
+
+ # Register the config and configuration for setup
+ registry.register("configuration", self)
+
+ user_config = self._build_opt_list(self.args.options)
+
+ config = OmegaConf.load(self.args.cfg_path)
+
+ runner_config = self.build_runner_config(config)
+ model_config = self.build_model_config(config, **user_config)
+ dataset_config = self.build_dataset_config(config)
+
+ # Validate the user-provided runner configuration
+ # model and dataset configuration are supposed to be validated by the respective classes
+ # [TODO] validate the model/dataset configuration
+ # self._validate_runner_config(runner_config)
+
+ # Override the default configuration with user options.
+ self.config = OmegaConf.merge(
+ runner_config, model_config, dataset_config, user_config
+ )
+
+ def _validate_runner_config(self, runner_config):
+ """
+ This method validates the configuration, such that
+ 1) all the user specified options are valid;
+ 2) no type mismatches between the user specified options and the config.
+ """
+ runner_config_validator = create_runner_config_validator()
+ runner_config_validator.validate(runner_config)
+
+ def _build_opt_list(self, opts):
+ opts_dot_list = self._convert_to_dot_list(opts)
+ return OmegaConf.from_dotlist(opts_dot_list)
+
+ @staticmethod
+ def build_model_config(config, **kwargs):
+ model = config.get("model", None)
+ assert model is not None, "Missing model configuration file."
+
+ model_cls = registry.get_model_class(model.arch)
+ assert model_cls is not None, f"Model '{model.arch}' has not been registered."
+
+ model_type = kwargs.get("model.model_type", None)
+ if not model_type:
+ model_type = model.get("model_type", None)
+ # else use the model type selected by user.
+
+ assert model_type is not None, "Missing model_type."
+
+ model_config_path = model_cls.default_config_path(model_type=model_type)
+
+ model_config = OmegaConf.create()
+ # hierarchy override, customized config > default config
+ model_config = OmegaConf.merge(
+ model_config,
+ OmegaConf.load(model_config_path),
+ {"model": config["model"]},
+ )
+
+ return model_config
+
+ @staticmethod
+ def build_runner_config(config):
+ return {"run": config.run}
+
+ @staticmethod
+ def build_dataset_config(config):
+ datasets = config.get("datasets", None)
+ if datasets is None:
+ raise KeyError(
+ "Expecting 'datasets' as the root key for dataset configuration."
+ )
+
+ dataset_config = OmegaConf.create()
+
+ for dataset_name in datasets:
+ builder_cls = registry.get_builder_class(dataset_name)
+
+ dataset_config_type = datasets[dataset_name].get("type", "default")
+ dataset_config_path = builder_cls.default_config_path(
+ type=dataset_config_type
+ )
+
+ # hierarchy override, customized config > default config
+ dataset_config = OmegaConf.merge(
+ dataset_config,
+ OmegaConf.load(dataset_config_path),
+ {"datasets": {dataset_name: config["datasets"][dataset_name]}},
+ )
+
+ return dataset_config
+
+ def _convert_to_dot_list(self, opts):
+ if opts is None:
+ opts = []
+
+ if len(opts) == 0:
+ return opts
+
+ has_equal = opts[0].find("=") != -1
+
+ if has_equal:
+ return opts
+
+ return [(opt + "=" + value) for opt, value in zip(opts[0::2], opts[1::2])]
+
+ def get_config(self):
+ return self.config
+
+ @property
+ def run_cfg(self):
+ return self.config.run
+
+ @property
+ def datasets_cfg(self):
+ return self.config.datasets
+
+ @property
+ def model_cfg(self):
+ return self.config.model
+
+ def pretty_print(self):
+ logging.info("\n===== Running Parameters =====")
+ logging.info(self._convert_node_to_json(self.config.run))
+
+ logging.info("\n====== Dataset Attributes ======")
+ datasets = self.config.datasets
+
+ for dataset in datasets:
+ if dataset in self.config.datasets:
+ logging.info(f"\n======== {dataset} =======")
+ dataset_config = self.config.datasets[dataset]
+ logging.info(self._convert_node_to_json(dataset_config))
+ else:
+ logging.warning(f"No dataset named '{dataset}' in config. Skipping")
+
+ logging.info(f"\n====== Model Attributes ======")
+ logging.info(self._convert_node_to_json(self.config.model))
+
+ def _convert_node_to_json(self, node):
+ container = OmegaConf.to_container(node, resolve=True)
+ return json.dumps(container, indent=4, sort_keys=True)
+
+ def to_dict(self):
+ return OmegaConf.to_container(self.config)
+
+
+def node_to_dict(node):
+ return OmegaConf.to_container(node)
+
+
+class ConfigValidator:
+ """
+ This is a preliminary implementation to centralize and validate the configuration.
+ May be altered in the future.
+
+ A helper class to validate configurations from yaml file.
+
+ This serves the following purposes:
+ 1. Ensure all the options in the yaml are defined, raise error if not.
+ 2. when type mismatches are found, the validator will raise an error.
+ 3. a central place to store and display helpful messages for supported configurations.
+
+ """
+
+ class _Argument:
+ def __init__(self, name, choices=None, type=None, help=None):
+ self.name = name
+ self.val = None
+ self.choices = choices
+ self.type = type
+ self.help = help
+
+ def __str__(self):
+ s = f"{self.name}={self.val}"
+ if self.type is not None:
+ s += f", ({self.type})"
+ if self.choices is not None:
+ s += f", choices: {self.choices}"
+ if self.help is not None:
+ s += f", ({self.help})"
+ return s
+
+ def __init__(self, description):
+ self.description = description
+
+ self.arguments = dict()
+
+ self.parsed_args = None
+
+ def __getitem__(self, key):
+ assert self.parsed_args is not None, "No arguments parsed yet."
+
+ return self.parsed_args[key]
+
+ def __str__(self) -> str:
+ return self.format_help()
+
+ def add_argument(self, *args, **kwargs):
+ """
+ Assume the first argument is the name of the argument.
+ """
+ self.arguments[args[0]] = self._Argument(*args, **kwargs)
+
+ def validate(self, config=None):
+ """
+ Convert yaml config (dict-like) to list, required by argparse.
+ """
+ for k, v in config.items():
+ assert (
+ k in self.arguments
+ ), f"""{k} is not a valid argument. Support arguments are {self.format_arguments()}."""
+
+ if self.arguments[k].type is not None:
+ try:
+ self.arguments[k].val = self.arguments[k].type(v)
+ except ValueError:
+ raise ValueError(f"{k} is not a valid {self.arguments[k].type}.")
+
+ if self.arguments[k].choices is not None:
+ assert (
+ v in self.arguments[k].choices
+ ), f"""{k} must be one of {self.arguments[k].choices}."""
+
+ return config
+
+ def format_arguments(self):
+ return str([f"{k}" for k in sorted(self.arguments.keys())])
+
+ def format_help(self):
+ # description + key-value pair string for each argument
+ help_msg = str(self.description)
+ return help_msg + ", available arguments: " + self.format_arguments()
+
+ def print_help(self):
+ # display help message
+ print(self.format_help())
+
+
+def create_runner_config_validator():
+ validator = ConfigValidator(description="Runner configurations")
+
+ validator.add_argument(
+ "runner",
+ type=str,
+ choices=["runner_base", "runner_iter"],
+ help="""Runner to use. The "runner_base" uses epoch-based training while iter-based
+ runner runs based on iters. Default: runner_base""",
+ )
+ # add arguments for training dataset ratios
+ validator.add_argument(
+ "train_dataset_ratios",
+ type=Dict[str, float],
+ help="""Ratios of training dataset. This is used in iteration-based runner.
+ Do not support for epoch-based runner because how to define an epoch becomes tricky.
+ Default: None""",
+ )
+ validator.add_argument(
+ "max_iters",
+ type=float,
+ help="Maximum number of iterations to run.",
+ )
+ validator.add_argument(
+ "max_epoch",
+ type=int,
+ help="Maximum number of epochs to run.",
+ )
+ # add arguments for iters_per_inner_epoch
+ validator.add_argument(
+ "iters_per_inner_epoch",
+ type=float,
+ help="Number of iterations per inner epoch. This is required when runner is runner_iter.",
+ )
+ lr_scheds_choices = registry.list_lr_schedulers()
+ validator.add_argument(
+ "lr_sched",
+ type=str,
+ choices=lr_scheds_choices,
+ help="Learning rate scheduler to use, from {}".format(lr_scheds_choices),
+ )
+ task_choices = registry.list_tasks()
+ validator.add_argument(
+ "task",
+ type=str,
+ choices=task_choices,
+ help="Task to use, from {}".format(task_choices),
+ )
+ # add arguments for init_lr
+ validator.add_argument(
+ "init_lr",
+ type=float,
+ help="Initial learning rate. This will be the learning rate after warmup and before decay.",
+ )
+ # add arguments for min_lr
+ validator.add_argument(
+ "min_lr",
+ type=float,
+ help="Minimum learning rate (after decay).",
+ )
+ # add arguments for warmup_lr
+ validator.add_argument(
+ "warmup_lr",
+ type=float,
+ help="Starting learning rate for warmup.",
+ )
+ # add arguments for learning rate decay rate
+ validator.add_argument(
+ "lr_decay_rate",
+ type=float,
+ help="Learning rate decay rate. Required if using a decaying learning rate scheduler.",
+ )
+ # add arguments for weight decay
+ validator.add_argument(
+ "weight_decay",
+ type=float,
+ help="Weight decay rate.",
+ )
+ # add arguments for training batch size
+ validator.add_argument(
+ "batch_size_train",
+ type=int,
+ help="Training batch size.",
+ )
+ # add arguments for evaluation batch size
+ validator.add_argument(
+ "batch_size_eval",
+ type=int,
+ help="Evaluation batch size, including validation and testing.",
+ )
+ # add arguments for number of workers for data loading
+ validator.add_argument(
+ "num_workers",
+ help="Number of workers for data loading.",
+ )
+ # add arguments for warm up steps
+ validator.add_argument(
+ "warmup_steps",
+ type=int,
+ help="Number of warmup steps. Required if a warmup schedule is used.",
+ )
+ # add arguments for random seed
+ validator.add_argument(
+ "seed",
+ type=int,
+ help="Random seed.",
+ )
+ # add arguments for output directory
+ validator.add_argument(
+ "output_dir",
+ type=str,
+ help="Output directory to save checkpoints and logs.",
+ )
+ # add arguments for whether only use evaluation
+ validator.add_argument(
+ "evaluate",
+ help="Whether to only evaluate the model. If true, training will not be performed.",
+ )
+ # add arguments for splits used for training, e.g. ["train", "val"]
+ validator.add_argument(
+ "train_splits",
+ type=list,
+ help="Splits to use for training.",
+ )
+ # add arguments for splits used for validation, e.g. ["val"]
+ validator.add_argument(
+ "valid_splits",
+ type=list,
+ help="Splits to use for validation. If not provided, will skip the validation.",
+ )
+ # add arguments for splits used for testing, e.g. ["test"]
+ validator.add_argument(
+ "test_splits",
+ type=list,
+ help="Splits to use for testing. If not provided, will skip the testing.",
+ )
+ # add arguments for accumulating gradient for iterations
+ validator.add_argument(
+ "accum_grad_iters",
+ type=int,
+ help="Number of iterations to accumulate gradient for.",
+ )
+
+ # ====== distributed training ======
+ validator.add_argument(
+ "device",
+ type=str,
+ choices=["cpu", "cuda"],
+ help="Device to use. Support 'cuda' or 'cpu' as for now.",
+ )
+ validator.add_argument(
+ "world_size",
+ type=int,
+ help="Number of processes participating in the job.",
+ )
+ validator.add_argument("dist_url", type=str)
+ validator.add_argument("distributed", type=bool)
+ # add arguments to opt using distributed sampler during evaluation or not
+ validator.add_argument(
+ "use_dist_eval_sampler",
+ type=bool,
+ help="Whether to use distributed sampler during evaluation or not.",
+ )
+
+ # ====== task specific ======
+ # generation task specific arguments
+ # add arguments for maximal length of text output
+ validator.add_argument(
+ "max_len",
+ type=int,
+ help="Maximal length of text output.",
+ )
+ # add arguments for minimal length of text output
+ validator.add_argument(
+ "min_len",
+ type=int,
+ help="Minimal length of text output.",
+ )
+    # add arguments for number of beams
+ validator.add_argument(
+ "num_beams",
+ type=int,
+ help="Number of beams used for beam search.",
+ )
+
+ # vqa task specific arguments
+ # add arguments for number of answer candidates
+ validator.add_argument(
+ "num_ans_candidates",
+ type=int,
+ help="""For ALBEF and BLIP, these models first rank answers according to likelihood to select answer candidates.""",
+ )
+ # add arguments for inference method
+ validator.add_argument(
+ "inference_method",
+ type=str,
+ choices=["genearte", "rank"],
+ help="""Inference method to use for question answering. If rank, requires a answer list.""",
+ )
+
+ # ====== model specific ======
+ validator.add_argument(
+ "k_test",
+ type=int,
+ help="Number of top k most similar samples from ITC/VTC selection to be tested.",
+ )
+
+ return validator
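+
+
+# Illustrative note (wiring assumed, not shown in this hunk): the validator built above
+# can be applied to a loaded run configuration before training, e.g.
+#   validator = create_runner_config_validator()
+#   validator.validate(cfg.run_cfg)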
diff --git a/models/MiniGPT4/minigpt4/common/dist_utils.py b/models/MiniGPT4/minigpt4/common/dist_utils.py
new file mode 100644
index 0000000..9280150
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/common/dist_utils.py
@@ -0,0 +1,137 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import datetime
+import functools
+import os
+
+import torch
+import torch.distributed as dist
+import timm.models.hub as timm_hub
+
+
+def setup_for_distributed(is_master):
+ """
+ This function disables printing when not in master process
+ """
+ import builtins as __builtin__
+
+ builtin_print = __builtin__.print
+
+ def print(*args, **kwargs):
+ force = kwargs.pop("force", False)
+ if is_master or force:
+ builtin_print(*args, **kwargs)
+
+ __builtin__.print = print
+
+
+def is_dist_avail_and_initialized():
+ if not dist.is_available():
+ return False
+ if not dist.is_initialized():
+ return False
+ return True
+
+
+def get_world_size():
+ if not is_dist_avail_and_initialized():
+ return 1
+ return dist.get_world_size()
+
+
+def get_rank():
+ if not is_dist_avail_and_initialized():
+ return 0
+ return dist.get_rank()
+
+
+def is_main_process():
+ return get_rank() == 0
+
+
+def init_distributed_mode(args):
+ if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+ args.rank = int(os.environ["RANK"])
+ args.world_size = int(os.environ["WORLD_SIZE"])
+ args.gpu = int(os.environ["LOCAL_RANK"])
+ elif "SLURM_PROCID" in os.environ:
+ args.rank = int(os.environ["SLURM_PROCID"])
+ args.gpu = args.rank % torch.cuda.device_count()
+ else:
+ print("Not using distributed mode")
+ args.distributed = False
+ return
+
+ args.distributed = True
+
+ torch.cuda.set_device(args.gpu)
+ args.dist_backend = "nccl"
+ print(
+ "| distributed init (rank {}, world {}): {}".format(
+ args.rank, args.world_size, args.dist_url
+ ),
+ flush=True,
+ )
+ torch.distributed.init_process_group(
+ backend=args.dist_backend,
+ init_method=args.dist_url,
+ world_size=args.world_size,
+ rank=args.rank,
+ timeout=datetime.timedelta(
+ days=365
+ ), # allow auto-downloading and de-compressing
+ )
+ torch.distributed.barrier()
+ setup_for_distributed(args.rank == 0)
+
+
+def get_dist_info():
+ if torch.__version__ < "1.0":
+ initialized = dist._initialized
+ else:
+ initialized = dist.is_initialized()
+ if initialized:
+ rank = dist.get_rank()
+ world_size = dist.get_world_size()
+ else: # non-distributed training
+ rank = 0
+ world_size = 1
+ return rank, world_size
+
+
+def main_process(func):
+ @functools.wraps(func)
+ def wrapper(*args, **kwargs):
+ rank, _ = get_dist_info()
+ if rank == 0:
+ return func(*args, **kwargs)
+
+ return wrapper
+
+
+def download_cached_file(url, check_hash=True, progress=False):
+ """
+ Download a file from a URL and cache it locally. If the file already exists, it is not downloaded again.
+ If distributed, only the main process downloads the file, and the other processes wait for the file to be downloaded.
+ """
+
+ def get_cached_file_path():
+ # a hack to sync the file path across processes
+ parts = torch.hub.urlparse(url)
+ filename = os.path.basename(parts.path)
+ cached_file = os.path.join(timm_hub.get_cache_dir(), filename)
+
+ return cached_file
+
+ if is_main_process():
+ timm_hub.download_cached_file(url, check_hash, progress)
+
+ if is_dist_avail_and_initialized():
+ dist.barrier()
+
+ return get_cached_file_path()
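+
+
+# Illustrative usage (URL is a placeholder): download a checkpoint once on the main
+# process and reuse the cached path on every rank.
+#   cached_file = download_cached_file("https://example.com/model.pth", check_hash=False, progress=True)
+#   state_dict = torch.load(cached_file, map_location="cpu")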
diff --git a/models/MiniGPT4/minigpt4/common/gradcam.py b/models/MiniGPT4/minigpt4/common/gradcam.py
new file mode 100644
index 0000000..d53a525
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/common/gradcam.py
@@ -0,0 +1,24 @@
+import numpy as np
+from matplotlib import pyplot as plt
+from scipy.ndimage import filters
+from skimage import transform as skimage_transform
+
+
+def getAttMap(img, attMap, blur=True, overlap=True):
+ attMap -= attMap.min()
+ if attMap.max() > 0:
+ attMap /= attMap.max()
+ attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant")
+ if blur:
+ attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2]))
+ attMap -= attMap.min()
+ attMap /= attMap.max()
+ cmap = plt.get_cmap("jet")
+ attMapV = cmap(attMap)
+ attMapV = np.delete(attMapV, 3, 2)
+ if overlap:
+ attMap = (
+ 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img
+ + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV
+ )
+ return attMap
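+
+
+# Illustrative usage (file name is a placeholder; PIL.Image import assumed):
+# overlay an attention map on an RGB image scaled to [0, 1].
+#   img = np.asarray(Image.open("demo.jpg").convert("RGB"), dtype=np.float32) / 255.0
+#   att_map = np.random.rand(24, 24)  # placeholder attention map
+#   overlay = getAttMap(img, att_map, blur=True, overlap=True)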
diff --git a/models/MiniGPT4/minigpt4/common/logger.py b/models/MiniGPT4/minigpt4/common/logger.py
new file mode 100644
index 0000000..9a5a727
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/common/logger.py
@@ -0,0 +1,195 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import datetime
+import logging
+import time
+from collections import defaultdict, deque
+
+import torch
+import torch.distributed as dist
+
+from minigpt4.common import dist_utils
+
+
+class SmoothedValue(object):
+ """Track a series of values and provide access to smoothed values over a
+ window or the global series average.
+ """
+
+ def __init__(self, window_size=20, fmt=None):
+ if fmt is None:
+ fmt = "{median:.4f} ({global_avg:.4f})"
+ self.deque = deque(maxlen=window_size)
+ self.total = 0.0
+ self.count = 0
+ self.fmt = fmt
+
+ def update(self, value, n=1):
+ self.deque.append(value)
+ self.count += n
+ self.total += value * n
+
+ def synchronize_between_processes(self):
+ """
+ Warning: does not synchronize the deque!
+ """
+ if not dist_utils.is_dist_avail_and_initialized():
+ return
+ t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
+ dist.barrier()
+ dist.all_reduce(t)
+ t = t.tolist()
+ self.count = int(t[0])
+ self.total = t[1]
+
+ @property
+ def median(self):
+ d = torch.tensor(list(self.deque))
+ return d.median().item()
+
+ @property
+ def avg(self):
+ d = torch.tensor(list(self.deque), dtype=torch.float32)
+ return d.mean().item()
+
+ @property
+ def global_avg(self):
+ return self.total / self.count
+
+ @property
+ def max(self):
+ return max(self.deque)
+
+ @property
+ def value(self):
+ return self.deque[-1]
+
+ def __str__(self):
+ return self.fmt.format(
+ median=self.median,
+ avg=self.avg,
+ global_avg=self.global_avg,
+ max=self.max,
+ value=self.value,
+ )
+
+
+class MetricLogger(object):
+ def __init__(self, delimiter="\t"):
+ self.meters = defaultdict(SmoothedValue)
+ self.delimiter = delimiter
+
+ def update(self, **kwargs):
+ for k, v in kwargs.items():
+ if isinstance(v, torch.Tensor):
+ v = v.item()
+ assert isinstance(v, (float, int))
+ self.meters[k].update(v)
+
+ def __getattr__(self, attr):
+ if attr in self.meters:
+ return self.meters[attr]
+ if attr in self.__dict__:
+ return self.__dict__[attr]
+ raise AttributeError(
+ "'{}' object has no attribute '{}'".format(type(self).__name__, attr)
+ )
+
+ def __str__(self):
+ loss_str = []
+ for name, meter in self.meters.items():
+ loss_str.append("{}: {}".format(name, str(meter)))
+ return self.delimiter.join(loss_str)
+
+ def global_avg(self):
+ loss_str = []
+ for name, meter in self.meters.items():
+ loss_str.append("{}: {:.4f}".format(name, meter.global_avg))
+ return self.delimiter.join(loss_str)
+
+ def synchronize_between_processes(self):
+ for meter in self.meters.values():
+ meter.synchronize_between_processes()
+
+ def add_meter(self, name, meter):
+ self.meters[name] = meter
+
+ def log_every(self, iterable, print_freq, header=None):
+ i = 0
+ if not header:
+ header = ""
+ start_time = time.time()
+ end = time.time()
+ iter_time = SmoothedValue(fmt="{avg:.4f}")
+ data_time = SmoothedValue(fmt="{avg:.4f}")
+ space_fmt = ":" + str(len(str(len(iterable)))) + "d"
+ log_msg = [
+ header,
+ "[{0" + space_fmt + "}/{1}]",
+ "eta: {eta}",
+ "{meters}",
+ "time: {time}",
+ "data: {data}",
+ ]
+ if torch.cuda.is_available():
+ log_msg.append("max mem: {memory:.0f}")
+ log_msg = self.delimiter.join(log_msg)
+ MB = 1024.0 * 1024.0
+ for obj in iterable:
+ data_time.update(time.time() - end)
+ yield obj
+ iter_time.update(time.time() - end)
+ if i % print_freq == 0 or i == len(iterable) - 1:
+ eta_seconds = iter_time.global_avg * (len(iterable) - i)
+ eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+ if torch.cuda.is_available():
+ print(
+ log_msg.format(
+ i,
+ len(iterable),
+ eta=eta_string,
+ meters=str(self),
+ time=str(iter_time),
+ data=str(data_time),
+ memory=torch.cuda.max_memory_allocated() / MB,
+ )
+ )
+ else:
+ print(
+ log_msg.format(
+ i,
+ len(iterable),
+ eta=eta_string,
+ meters=str(self),
+ time=str(iter_time),
+ data=str(data_time),
+ )
+ )
+ i += 1
+ end = time.time()
+ total_time = time.time() - start_time
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+ print(
+ "{} Total time: {} ({:.4f} s / it)".format(
+ header, total_time_str, total_time / len(iterable)
+ )
+ )
+
+
+class AttrDict(dict):
+ def __init__(self, *args, **kwargs):
+ super(AttrDict, self).__init__(*args, **kwargs)
+ self.__dict__ = self
+
+
+def setup_logger():
+ logging.basicConfig(
+ level=logging.INFO if dist_utils.is_main_process() else logging.WARN,
+ format="%(asctime)s [%(levelname)s] %(message)s",
+ handlers=[logging.StreamHandler()],
+ )
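+
+
+# Illustrative usage: report smoothed training statistics every `print_freq` iterations.
+#   metric_logger = MetricLogger(delimiter="  ")
+#   for samples in metric_logger.log_every(data_loader, print_freq=50, header="Train:"):
+#       loss = model(samples)["loss"]      # placeholder training step
+#       metric_logger.update(loss=loss.item())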
diff --git a/models/MiniGPT4/minigpt4/common/optims.py b/models/MiniGPT4/minigpt4/common/optims.py
new file mode 100644
index 0000000..58327f7
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/common/optims.py
@@ -0,0 +1,119 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import math
+
+from minigpt4.common.registry import registry
+
+
+@registry.register_lr_scheduler("linear_warmup_step_lr")
+class LinearWarmupStepLRScheduler:
+ def __init__(
+ self,
+ optimizer,
+ max_epoch,
+ min_lr,
+ init_lr,
+ decay_rate=1,
+ warmup_start_lr=-1,
+ warmup_steps=0,
+ **kwargs
+ ):
+ self.optimizer = optimizer
+
+ self.max_epoch = max_epoch
+ self.min_lr = min_lr
+
+ self.decay_rate = decay_rate
+
+ self.init_lr = init_lr
+ self.warmup_steps = warmup_steps
+ self.warmup_start_lr = warmup_start_lr if warmup_start_lr >= 0 else init_lr
+
+ def step(self, cur_epoch, cur_step):
+ if cur_epoch == 0:
+ warmup_lr_schedule(
+ step=cur_step,
+ optimizer=self.optimizer,
+ max_step=self.warmup_steps,
+ init_lr=self.warmup_start_lr,
+ max_lr=self.init_lr,
+ )
+ else:
+ step_lr_schedule(
+ epoch=cur_epoch,
+ optimizer=self.optimizer,
+ init_lr=self.init_lr,
+ min_lr=self.min_lr,
+ decay_rate=self.decay_rate,
+ )
+
+
+@registry.register_lr_scheduler("linear_warmup_cosine_lr")
+class LinearWarmupCosineLRScheduler:
+ def __init__(
+ self,
+ optimizer,
+ max_epoch,
+ iters_per_epoch,
+ min_lr,
+ init_lr,
+ warmup_steps=0,
+ warmup_start_lr=-1,
+ **kwargs
+ ):
+ self.optimizer = optimizer
+
+ self.max_epoch = max_epoch
+ self.iters_per_epoch = iters_per_epoch
+ self.min_lr = min_lr
+
+ self.init_lr = init_lr
+ self.warmup_steps = warmup_steps
+ self.warmup_start_lr = warmup_start_lr if warmup_start_lr >= 0 else init_lr
+
+ def step(self, cur_epoch, cur_step):
+ total_cur_step = cur_epoch * self.iters_per_epoch + cur_step
+ if total_cur_step < self.warmup_steps:
+ warmup_lr_schedule(
+ step=cur_step,
+ optimizer=self.optimizer,
+ max_step=self.warmup_steps,
+ init_lr=self.warmup_start_lr,
+ max_lr=self.init_lr,
+ )
+ else:
+ cosine_lr_schedule(
+ epoch=total_cur_step,
+ optimizer=self.optimizer,
+ max_epoch=self.max_epoch * self.iters_per_epoch,
+ init_lr=self.init_lr,
+ min_lr=self.min_lr,
+ )
+
+
+def cosine_lr_schedule(optimizer, epoch, max_epoch, init_lr, min_lr):
+ """Decay the learning rate"""
+ lr = (init_lr - min_lr) * 0.5 * (
+ 1.0 + math.cos(math.pi * epoch / max_epoch)
+ ) + min_lr
+ for param_group in optimizer.param_groups:
+ param_group["lr"] = lr
+
+
+def warmup_lr_schedule(optimizer, step, max_step, init_lr, max_lr):
+ """Warmup the learning rate"""
+ lr = min(max_lr, init_lr + (max_lr - init_lr) * step / max(max_step, 1))
+ for param_group in optimizer.param_groups:
+ param_group["lr"] = lr
+
+
+def step_lr_schedule(optimizer, epoch, init_lr, min_lr, decay_rate):
+ """Decay the learning rate"""
+ lr = max(min_lr, init_lr * (decay_rate**epoch))
+ for param_group in optimizer.param_groups:
+ param_group["lr"] = lr
diff --git a/models/MiniGPT4/minigpt4/common/registry.py b/models/MiniGPT4/minigpt4/common/registry.py
new file mode 100644
index 0000000..679467a
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/common/registry.py
@@ -0,0 +1,329 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+
+class Registry:
+ mapping = {
+ "builder_name_mapping": {},
+ "task_name_mapping": {},
+ "processor_name_mapping": {},
+ "model_name_mapping": {},
+ "lr_scheduler_name_mapping": {},
+ "runner_name_mapping": {},
+ "state": {},
+ "paths": {},
+ }
+
+ @classmethod
+ def register_builder(cls, name):
+ r"""Register a dataset builder to registry with key 'name'
+
+ Args:
+ name: Key with which the builder will be registered.
+
+ Usage:
+
+ from minigpt4.common.registry import registry
+ from minigpt4.datasets.base_dataset_builder import BaseDatasetBuilder
+ """
+
+ def wrap(builder_cls):
+ from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder
+
+ assert issubclass(
+ builder_cls, BaseDatasetBuilder
+ ), "All builders must inherit BaseDatasetBuilder class, found {}".format(
+ builder_cls
+ )
+ if name in cls.mapping["builder_name_mapping"]:
+ raise KeyError(
+ "Name '{}' already registered for {}.".format(
+ name, cls.mapping["builder_name_mapping"][name]
+ )
+ )
+ cls.mapping["builder_name_mapping"][name] = builder_cls
+ return builder_cls
+
+ return wrap
+
+ @classmethod
+ def register_task(cls, name):
+ r"""Register a task to registry with key 'name'
+
+ Args:
+ name: Key with which the task will be registered.
+
+ Usage:
+
+ from minigpt4.common.registry import registry
+ """
+
+ def wrap(task_cls):
+ from minigpt4.tasks.base_task import BaseTask
+
+ assert issubclass(
+ task_cls, BaseTask
+ ), "All tasks must inherit BaseTask class"
+ if name in cls.mapping["task_name_mapping"]:
+ raise KeyError(
+ "Name '{}' already registered for {}.".format(
+ name, cls.mapping["task_name_mapping"][name]
+ )
+ )
+ cls.mapping["task_name_mapping"][name] = task_cls
+ return task_cls
+
+ return wrap
+
+ @classmethod
+ def register_model(cls, name):
+ r"""Register a task to registry with key 'name'
+
+ Args:
+ name: Key with which the task will be registered.
+
+ Usage:
+
+ from minigpt4.common.registry import registry
+ """
+
+ def wrap(model_cls):
+ from minigpt4.models import BaseModel
+
+ assert issubclass(
+ model_cls, BaseModel
+ ), "All models must inherit BaseModel class"
+ if name in cls.mapping["model_name_mapping"]:
+ raise KeyError(
+ "Name '{}' already registered for {}.".format(
+ name, cls.mapping["model_name_mapping"][name]
+ )
+ )
+ cls.mapping["model_name_mapping"][name] = model_cls
+ return model_cls
+
+ return wrap
+
+ @classmethod
+ def register_processor(cls, name):
+ r"""Register a processor to registry with key 'name'
+
+ Args:
+            name: Key with which the processor will be registered.
+
+ Usage:
+
+ from minigpt4.common.registry import registry
+ """
+
+ def wrap(processor_cls):
+ from minigpt4.processors import BaseProcessor
+
+ assert issubclass(
+ processor_cls, BaseProcessor
+ ), "All processors must inherit BaseProcessor class"
+ if name in cls.mapping["processor_name_mapping"]:
+ raise KeyError(
+ "Name '{}' already registered for {}.".format(
+ name, cls.mapping["processor_name_mapping"][name]
+ )
+ )
+ cls.mapping["processor_name_mapping"][name] = processor_cls
+ return processor_cls
+
+ return wrap
+
+ @classmethod
+ def register_lr_scheduler(cls, name):
+ r"""Register a model to registry with key 'name'
+
+ Args:
+ name: Key with which the task will be registered.
+
+ Usage:
+
+ from minigpt4.common.registry import registry
+ """
+
+ def wrap(lr_sched_cls):
+ if name in cls.mapping["lr_scheduler_name_mapping"]:
+ raise KeyError(
+ "Name '{}' already registered for {}.".format(
+ name, cls.mapping["lr_scheduler_name_mapping"][name]
+ )
+ )
+ cls.mapping["lr_scheduler_name_mapping"][name] = lr_sched_cls
+ return lr_sched_cls
+
+ return wrap
+
+ @classmethod
+ def register_runner(cls, name):
+ r"""Register a model to registry with key 'name'
+
+ Args:
+ name: Key with which the task will be registered.
+
+ Usage:
+
+ from minigpt4.common.registry import registry
+ """
+
+ def wrap(runner_cls):
+ if name in cls.mapping["runner_name_mapping"]:
+ raise KeyError(
+ "Name '{}' already registered for {}.".format(
+ name, cls.mapping["runner_name_mapping"][name]
+ )
+ )
+ cls.mapping["runner_name_mapping"][name] = runner_cls
+ return runner_cls
+
+ return wrap
+
+ @classmethod
+ def register_path(cls, name, path):
+ r"""Register a path to registry with key 'name'
+
+ Args:
+ name: Key with which the path will be registered.
+
+ Usage:
+
+ from minigpt4.common.registry import registry
+ """
+        assert isinstance(path, str), "All paths must be str."
+ if name in cls.mapping["paths"]:
+ raise KeyError("Name '{}' already registered.".format(name))
+ cls.mapping["paths"][name] = path
+
+ @classmethod
+ def register(cls, name, obj):
+ r"""Register an item to registry with key 'name'
+
+ Args:
+ name: Key with which the item will be registered.
+
+ Usage::
+
+ from minigpt4.common.registry import registry
+
+ registry.register("config", {})
+ """
+ path = name.split(".")
+ current = cls.mapping["state"]
+
+ for part in path[:-1]:
+ if part not in current:
+ current[part] = {}
+ current = current[part]
+
+ current[path[-1]] = obj
+
+ # @classmethod
+ # def get_trainer_class(cls, name):
+ # return cls.mapping["trainer_name_mapping"].get(name, None)
+
+ @classmethod
+ def get_builder_class(cls, name):
+ return cls.mapping["builder_name_mapping"].get(name, None)
+
+ @classmethod
+ def get_model_class(cls, name):
+ return cls.mapping["model_name_mapping"].get(name, None)
+
+ @classmethod
+ def get_task_class(cls, name):
+ return cls.mapping["task_name_mapping"].get(name, None)
+
+ @classmethod
+ def get_processor_class(cls, name):
+ return cls.mapping["processor_name_mapping"].get(name, None)
+
+ @classmethod
+ def get_lr_scheduler_class(cls, name):
+ return cls.mapping["lr_scheduler_name_mapping"].get(name, None)
+
+ @classmethod
+ def get_runner_class(cls, name):
+ return cls.mapping["runner_name_mapping"].get(name, None)
+
+ @classmethod
+ def list_runners(cls):
+ return sorted(cls.mapping["runner_name_mapping"].keys())
+
+ @classmethod
+ def list_models(cls):
+ return sorted(cls.mapping["model_name_mapping"].keys())
+
+ @classmethod
+ def list_tasks(cls):
+ return sorted(cls.mapping["task_name_mapping"].keys())
+
+ @classmethod
+ def list_processors(cls):
+ return sorted(cls.mapping["processor_name_mapping"].keys())
+
+ @classmethod
+ def list_lr_schedulers(cls):
+ return sorted(cls.mapping["lr_scheduler_name_mapping"].keys())
+
+ @classmethod
+ def list_datasets(cls):
+ return sorted(cls.mapping["builder_name_mapping"].keys())
+
+ @classmethod
+ def get_path(cls, name):
+ return cls.mapping["paths"].get(name, None)
+
+ @classmethod
+ def get(cls, name, default=None, no_warning=False):
+ r"""Get an item from registry with key 'name'
+
+ Args:
+ name (string): Key whose value needs to be retrieved.
+ default: If passed and key is not in registry, default value will
+ be returned with a warning. Default: None
+ no_warning (bool): If passed as True, warning when key doesn't exist
+ will not be generated. Useful for MMF's
+ internal operations. Default: False
+ """
+ original_name = name
+ name = name.split(".")
+ value = cls.mapping["state"]
+ for subname in name:
+ value = value.get(subname, default)
+ if value is default:
+ break
+
+ if (
+ "writer" in cls.mapping["state"]
+ and value == default
+ and no_warning is False
+ ):
+ cls.mapping["state"]["writer"].warning(
+ "Key {} is not present in registry, returning default value "
+ "of {}".format(original_name, default)
+ )
+ return value
+
+ @classmethod
+ def unregister(cls, name):
+ r"""Remove an item from registry with key 'name'
+
+ Args:
+ name: Key which needs to be removed.
+ Usage::
+
+ from mmf.common.registry import registry
+
+ config = registry.unregister("config")
+ """
+ return cls.mapping["state"].pop(name, None)
+
+
+registry = Registry()
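+
+# Illustrative usage: store and retrieve shared state, or look up registered classes.
+#   registry.register("config", {"seed": 42})
+#   cfg = registry.get("config")
+#   runner_cls = registry.get_runner_class("runner_base")   # None if not registered
+#   print(registry.list_lr_schedulers())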
diff --git a/models/MiniGPT4/minigpt4/common/utils.py b/models/MiniGPT4/minigpt4/common/utils.py
new file mode 100644
index 0000000..a3069cd
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/common/utils.py
@@ -0,0 +1,424 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import io
+import json
+import logging
+import os
+import pickle
+import re
+import shutil
+import urllib
+import urllib.error
+import urllib.request
+from typing import Optional
+from urllib.parse import urlparse
+
+import numpy as np
+import pandas as pd
+import yaml
+from iopath.common.download import download
+from iopath.common.file_io import file_lock, g_pathmgr
+from minigpt4.common.registry import registry
+from torch.utils.model_zoo import tqdm
+from torchvision.datasets.utils import (
+ check_integrity,
+ download_file_from_google_drive,
+ extract_archive,
+)
+
+
+def now():
+ from datetime import datetime
+
+ return datetime.now().strftime("%Y%m%d%H%M")[:-1]
+
+
+def is_url(url_or_filename):
+ parsed = urlparse(url_or_filename)
+ return parsed.scheme in ("http", "https")
+
+
+def get_cache_path(rel_path):
+ return os.path.expanduser(os.path.join(registry.get_path("cache_root"), rel_path))
+
+
+def get_abs_path(rel_path):
+ return os.path.join(registry.get_path("library_root"), rel_path)
+
+
+def load_json(filename):
+ with open(filename, "r") as f:
+ return json.load(f)
+
+
+# The following are adapted from torchvision and vissl
+# torchvision: https://github.com/pytorch/vision
+# vissl: https://github.com/facebookresearch/vissl/blob/main/vissl/utils/download.py
+
+
+def makedir(dir_path):
+ """
+ Create the directory if it does not exist.
+ """
+ is_success = False
+ try:
+ if not g_pathmgr.exists(dir_path):
+ g_pathmgr.mkdirs(dir_path)
+ is_success = True
+ except BaseException:
+ print(f"Error creating directory: {dir_path}")
+ return is_success
+
+
+def get_redirected_url(url: str):
+ """
+ Given a URL, returns the URL it redirects to or the
+ original URL in case of no indirection
+ """
+ import requests
+
+ with requests.Session() as session:
+ with session.get(url, stream=True, allow_redirects=True) as response:
+ if response.history:
+ return response.url
+ else:
+ return url
+
+
+def to_google_drive_download_url(view_url: str) -> str:
+ """
+ Utility function to transform a view URL of google drive
+ to a download URL for google drive
+ Example input:
+ https://drive.google.com/file/d/137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp/view
+ Example output:
+ https://drive.google.com/uc?export=download&id=137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp
+ """
+ splits = view_url.split("/")
+ assert splits[-1] == "view"
+ file_id = splits[-2]
+ return f"https://drive.google.com/uc?export=download&id={file_id}"
+
+
+def download_google_drive_url(url: str, output_path: str, output_file_name: str):
+ """
+    Download a file from Google Drive.
+    Downloading a URL from Google Drive requires confirmation when
+    the file is too big (Google Drive notifies that anti-virus
+    checks cannot be performed on such files).
+ """
+ import requests
+
+ with requests.Session() as session:
+
+ # First get the confirmation token and append it to the URL
+ with session.get(url, stream=True, allow_redirects=True) as response:
+ for k, v in response.cookies.items():
+ if k.startswith("download_warning"):
+ url = url + "&confirm=" + v
+
+ # Then download the content of the file
+ with session.get(url, stream=True, verify=True) as response:
+ makedir(output_path)
+ path = os.path.join(output_path, output_file_name)
+ total_size = int(response.headers.get("Content-length", 0))
+ with open(path, "wb") as file:
+ from tqdm import tqdm
+
+ with tqdm(total=total_size) as progress_bar:
+ for block in response.iter_content(
+ chunk_size=io.DEFAULT_BUFFER_SIZE
+ ):
+ file.write(block)
+ progress_bar.update(len(block))
+
+
+def _get_google_drive_file_id(url: str) -> Optional[str]:
+ parts = urlparse(url)
+
+ if re.match(r"(drive|docs)[.]google[.]com", parts.netloc) is None:
+ return None
+
+ match = re.match(r"/file/d/(?P[^/]*)", parts.path)
+ if match is None:
+ return None
+
+ return match.group("id")
+
+
+def _urlretrieve(url: str, filename: str, chunk_size: int = 1024) -> None:
+ with open(filename, "wb") as fh:
+ with urllib.request.urlopen(
+ urllib.request.Request(url, headers={"User-Agent": "vissl"})
+ ) as response:
+ with tqdm(total=response.length) as pbar:
+ for chunk in iter(lambda: response.read(chunk_size), ""):
+ if not chunk:
+ break
+ pbar.update(chunk_size)
+ fh.write(chunk)
+
+
+def download_url(
+ url: str,
+ root: str,
+ filename: Optional[str] = None,
+ md5: Optional[str] = None,
+) -> None:
+ """Download a file from a url and place it in root.
+ Args:
+ url (str): URL to download file from
+ root (str): Directory to place downloaded file in
+ filename (str, optional): Name to save the file under.
+ If None, use the basename of the URL.
+ md5 (str, optional): MD5 checksum of the download. If None, do not check
+ """
+ root = os.path.expanduser(root)
+ if not filename:
+ filename = os.path.basename(url)
+ fpath = os.path.join(root, filename)
+
+ makedir(root)
+
+ # check if file is already present locally
+ if check_integrity(fpath, md5):
+ print("Using downloaded and verified file: " + fpath)
+ return
+
+ # expand redirect chain if needed
+ url = get_redirected_url(url)
+
+ # check if file is located on Google Drive
+ file_id = _get_google_drive_file_id(url)
+ if file_id is not None:
+ return download_file_from_google_drive(file_id, root, filename, md5)
+
+ # download the file
+ try:
+ print("Downloading " + url + " to " + fpath)
+ _urlretrieve(url, fpath)
+ except (urllib.error.URLError, IOError) as e: # type: ignore[attr-defined]
+ if url[:5] == "https":
+ url = url.replace("https:", "http:")
+ print(
+ "Failed download. Trying https -> http instead."
+ " Downloading " + url + " to " + fpath
+ )
+ _urlretrieve(url, fpath)
+ else:
+ raise e
+
+ # check integrity of downloaded file
+ if not check_integrity(fpath, md5):
+ raise RuntimeError("File not found or corrupted.")
+
+
+def download_and_extract_archive(
+ url: str,
+ download_root: str,
+ extract_root: Optional[str] = None,
+ filename: Optional[str] = None,
+ md5: Optional[str] = None,
+ remove_finished: bool = False,
+) -> None:
+ download_root = os.path.expanduser(download_root)
+ if extract_root is None:
+ extract_root = download_root
+ if not filename:
+ filename = os.path.basename(url)
+
+ download_url(url, download_root, filename, md5)
+
+ archive = os.path.join(download_root, filename)
+ print("Extracting {} to {}".format(archive, extract_root))
+ extract_archive(archive, extract_root, remove_finished)
+
+
+def cache_url(url: str, cache_dir: str) -> str:
+ """
+ This implementation downloads the remote resource and caches it locally.
+ The resource will only be downloaded if not previously requested.
+ """
+ parsed_url = urlparse(url)
+ dirname = os.path.join(cache_dir, os.path.dirname(parsed_url.path.lstrip("/")))
+ makedir(dirname)
+ filename = url.split("/")[-1]
+ cached = os.path.join(dirname, filename)
+ with file_lock(cached):
+ if not os.path.isfile(cached):
+ logging.info(f"Downloading {url} to {cached} ...")
+ cached = download(url, dirname, filename=filename)
+ logging.info(f"URL {url} cached in {cached}")
+ return cached
+
+
+# TODO (prigoyal): convert this into RAII-style API
+def create_file_symlink(file1, file2):
+ """
+    Create a symlink from file1 to file2.
+    Useful during model checkpointing to symlink to the
+    latest successful checkpoint.
+ """
+ try:
+ if g_pathmgr.exists(file2):
+ g_pathmgr.rm(file2)
+ g_pathmgr.symlink(file1, file2)
+ except Exception as e:
+ logging.info(f"Could NOT create symlink. Error: {e}")
+
+
+def save_file(data, filename, append_to_json=True, verbose=True):
+ """
+ Common i/o utility to handle saving data to various file formats.
+ Supported:
+    .pkl, .pickle, .npy, .json, .yaml
+ Specifically for .json, users have the option to either append (default)
+ or rewrite by passing in Boolean value to append_to_json.
+ """
+ if verbose:
+ logging.info(f"Saving data to file: {filename}")
+ file_ext = os.path.splitext(filename)[1]
+ if file_ext in [".pkl", ".pickle"]:
+ with g_pathmgr.open(filename, "wb") as fopen:
+ pickle.dump(data, fopen, pickle.HIGHEST_PROTOCOL)
+ elif file_ext == ".npy":
+ with g_pathmgr.open(filename, "wb") as fopen:
+ np.save(fopen, data)
+ elif file_ext == ".json":
+ if append_to_json:
+ with g_pathmgr.open(filename, "a") as fopen:
+ fopen.write(json.dumps(data, sort_keys=True) + "\n")
+ fopen.flush()
+ else:
+ with g_pathmgr.open(filename, "w") as fopen:
+ fopen.write(json.dumps(data, sort_keys=True) + "\n")
+ fopen.flush()
+ elif file_ext == ".yaml":
+ with g_pathmgr.open(filename, "w") as fopen:
+ dump = yaml.dump(data)
+ fopen.write(dump)
+ fopen.flush()
+ else:
+ raise Exception(f"Saving {file_ext} is not supported yet")
+
+ if verbose:
+ logging.info(f"Saved data to file: {filename}")
+
+
+def load_file(filename, mmap_mode=None, verbose=True, allow_pickle=False):
+ """
+ Common i/o utility to handle loading data from various file formats.
+ Supported:
+    .txt, .pkl, .pickle, .npy, .json, .yaml, .csv
+ For the npy files, we support reading the files in mmap_mode.
+ If the mmap_mode of reading is not successful, we load data without the
+ mmap_mode.
+ """
+ if verbose:
+ logging.info(f"Loading data from file: {filename}")
+
+ file_ext = os.path.splitext(filename)[1]
+ if file_ext == ".txt":
+ with g_pathmgr.open(filename, "r") as fopen:
+ data = fopen.readlines()
+ elif file_ext in [".pkl", ".pickle"]:
+ with g_pathmgr.open(filename, "rb") as fopen:
+ data = pickle.load(fopen, encoding="latin1")
+ elif file_ext == ".npy":
+ if mmap_mode:
+ try:
+ with g_pathmgr.open(filename, "rb") as fopen:
+ data = np.load(
+ fopen,
+ allow_pickle=allow_pickle,
+ encoding="latin1",
+ mmap_mode=mmap_mode,
+ )
+ except ValueError as e:
+ logging.info(
+ f"Could not mmap {filename}: {e}. Trying without g_pathmgr"
+ )
+ data = np.load(
+ filename,
+ allow_pickle=allow_pickle,
+ encoding="latin1",
+ mmap_mode=mmap_mode,
+ )
+ logging.info("Successfully loaded without g_pathmgr")
+ except Exception:
+ logging.info("Could not mmap without g_pathmgr. Trying without mmap")
+ with g_pathmgr.open(filename, "rb") as fopen:
+ data = np.load(fopen, allow_pickle=allow_pickle, encoding="latin1")
+ else:
+ with g_pathmgr.open(filename, "rb") as fopen:
+ data = np.load(fopen, allow_pickle=allow_pickle, encoding="latin1")
+ elif file_ext == ".json":
+ with g_pathmgr.open(filename, "r") as fopen:
+ data = json.load(fopen)
+ elif file_ext == ".yaml":
+ with g_pathmgr.open(filename, "r") as fopen:
+ data = yaml.load(fopen, Loader=yaml.FullLoader)
+ elif file_ext == ".csv":
+ with g_pathmgr.open(filename, "r") as fopen:
+ data = pd.read_csv(fopen)
+ else:
+ raise Exception(f"Reading from {file_ext} is not supported yet")
+ return data
+
+
+def abspath(resource_path: str):
+ """
+ Make a path absolute, but take into account prefixes like
+ "http://" or "manifold://"
+ """
+ regex = re.compile(r"^\w+://")
+ if regex.match(resource_path) is None:
+ return os.path.abspath(resource_path)
+ else:
+ return resource_path
+
+
+def makedir(dir_path):
+ """
+ Create the directory if it does not exist.
+ """
+ is_success = False
+ try:
+ if not g_pathmgr.exists(dir_path):
+ g_pathmgr.mkdirs(dir_path)
+ is_success = True
+ except BaseException:
+ logging.info(f"Error creating directory: {dir_path}")
+ return is_success
+
+
+def is_url(input_url):
+ """
+    Check if an input string is a URL. Looks for http(s):// and ignores case.
+ """
+ is_url = re.match(r"^(?:http)s?://", input_url, re.IGNORECASE) is not None
+ return is_url
+
+
+def cleanup_dir(dir):
+ """
+ Utility for deleting a directory. Useful for cleaning the storage space
+ that contains various training artifacts like checkpoints, data etc.
+ """
+ if os.path.exists(dir):
+ logging.info(f"Deleting directory: {dir}")
+ shutil.rmtree(dir)
+ logging.info(f"Deleted contents of directory: {dir}")
+
+
+def get_file_size(filename):
+ """
+ Given a file, get the size of file in MB
+ """
+ size_in_mb = os.path.getsize(filename) / float(1024**2)
+ return size_in_mb
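+
+
+# Illustrative usage (URL and paths are placeholders): fetch and unpack an archive locally.
+#   download_and_extract_archive(
+#       url="https://example.com/annotations.tar.gz",
+#       download_root="/tmp/minigpt4_downloads",
+#       remove_finished=True,
+#   )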
diff --git a/models/MiniGPT4/minigpt4/configs/datasets/cc_sbu/align.yaml b/models/MiniGPT4/minigpt4/configs/datasets/cc_sbu/align.yaml
new file mode 100644
index 0000000..5710834
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/configs/datasets/cc_sbu/align.yaml
@@ -0,0 +1,5 @@
+datasets:
+ cc_sbu_align:
+ data_type: images
+ build_info:
+ storage: /path/to/cc_sbu_align/
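+      # The storage directory is expected to contain filter_cap.json and an image/ subfolder
+      # (see CCSBUAlignBuilder.build_datasets in minigpt4/datasets/builders/image_text_pair_builder.py).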
diff --git a/models/MiniGPT4/minigpt4/configs/datasets/cc_sbu/defaults.yaml b/models/MiniGPT4/minigpt4/configs/datasets/cc_sbu/defaults.yaml
new file mode 100644
index 0000000..60390ee
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/configs/datasets/cc_sbu/defaults.yaml
@@ -0,0 +1,5 @@
+datasets:
+ cc_sbu:
+ data_type: images
+ build_info:
+ storage: /path/to/cc_sbu_dataset/{00000..01255}.tar
diff --git a/models/MiniGPT4/minigpt4/configs/datasets/laion/defaults.yaml b/models/MiniGPT4/minigpt4/configs/datasets/laion/defaults.yaml
new file mode 100644
index 0000000..6bad629
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/configs/datasets/laion/defaults.yaml
@@ -0,0 +1,5 @@
+datasets:
+ laion:
+ data_type: images
+ build_info:
+ storage: /path/to/laion_dataset/{00000..10488}.tar
diff --git a/models/MiniGPT4/minigpt4/configs/default.yaml b/models/MiniGPT4/minigpt4/configs/default.yaml
new file mode 100644
index 0000000..ff5a6a2
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/configs/default.yaml
@@ -0,0 +1,5 @@
+env:
+ # For default users
+ # cache_root: "cache"
+ # For internal use with persistent storage
+ cache_root: "/export/home/.cache/minigpt4"
diff --git a/models/MiniGPT4/minigpt4/configs/models/minigpt4.yaml b/models/MiniGPT4/minigpt4/configs/models/minigpt4.yaml
new file mode 100644
index 0000000..abf38a9
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/configs/models/minigpt4.yaml
@@ -0,0 +1,33 @@
+model:
+ arch: mini_gpt4
+
+ # vit encoder
+ image_size: 224
+ drop_path_rate: 0
+ use_grad_checkpoint: False
+ vit_precision: "fp16"
+ freeze_vit: True
+ freeze_qformer: True
+
+ # Q-Former
+ num_query_token: 32
+
+ # Vicuna
+ llama_model: "./models/MiniGPT4/model_weight/model"
+
+ # generation configs
+ prompt: ""
+
+preprocess:
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ eval:
+ name: "blip2_image_eval"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ eval:
+ name: "blip_caption"
diff --git a/models/MiniGPT4/minigpt4/conversation/__init__.py b/models/MiniGPT4/minigpt4/conversation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/MiniGPT4/minigpt4/conversation/__pycache__/__init__.cpython-310.pyc b/models/MiniGPT4/minigpt4/conversation/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..9d9ab35
Binary files /dev/null and b/models/MiniGPT4/minigpt4/conversation/__pycache__/__init__.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/conversation/__pycache__/conversation.cpython-310.pyc b/models/MiniGPT4/minigpt4/conversation/__pycache__/conversation.cpython-310.pyc
new file mode 100644
index 0000000..5aaafe4
Binary files /dev/null and b/models/MiniGPT4/minigpt4/conversation/__pycache__/conversation.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/conversation/conversation.py b/models/MiniGPT4/minigpt4/conversation/conversation.py
new file mode 100644
index 0000000..676d89f
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/conversation/conversation.py
@@ -0,0 +1,207 @@
+import argparse
+import time
+from PIL import Image
+
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer
+from transformers import StoppingCriteria, StoppingCriteriaList
+
+import dataclasses
+from enum import auto, Enum
+from typing import List, Tuple, Any
+
+from minigpt4.common.registry import registry
+
+
+class SeparatorStyle(Enum):
+ """Different separator style."""
+ SINGLE = auto()
+ TWO = auto()
+
+
+@dataclasses.dataclass
+class Conversation:
+ """A class that keeps all conversation history."""
+ system: str
+ roles: List[str]
+ messages: List[List[str]]
+ offset: int
+ # system_img: List[Image.Image] = []
+ sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+ sep: str = "###"
+ sep2: str = None
+
+ skip_next: bool = False
+ conv_id: Any = None
+
+ def get_prompt(self):
+ if self.sep_style == SeparatorStyle.SINGLE:
+ ret = self.system + self.sep
+ for role, message in self.messages:
+ if message:
+ ret += role + ": " + message + self.sep
+ else:
+ ret += role + ":"
+ return ret
+ elif self.sep_style == SeparatorStyle.TWO:
+ seps = [self.sep, self.sep2]
+ ret = self.system + seps[0]
+ for i, (role, message) in enumerate(self.messages):
+ if message:
+ ret += role + ": " + message + seps[i % 2]
+ else:
+ ret += role + ":"
+ return ret
+ else:
+ raise ValueError(f"Invalid style: {self.sep_style}")
+
+ def append_message(self, role, message):
+ self.messages.append([role, message])
+
+ def to_gradio_chatbot(self):
+ ret = []
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
+ if i % 2 == 0:
+ ret.append([msg, None])
+ else:
+ ret[-1][-1] = msg
+ return ret
+
+ def copy(self):
+ return Conversation(
+ system=self.system,
+ # system_img=self.system_img,
+ roles=self.roles,
+ messages=[[x, y] for x, y in self.messages],
+ offset=self.offset,
+ sep_style=self.sep_style,
+ sep=self.sep,
+ sep2=self.sep2,
+ conv_id=self.conv_id)
+
+ def dict(self):
+ return {
+ "system": self.system,
+ # "system_img": self.system_img,
+ "roles": self.roles,
+ "messages": self.messages,
+ "offset": self.offset,
+ "sep": self.sep,
+ "sep2": self.sep2,
+ "conv_id": self.conv_id,
+ }
+
+
+class StoppingCriteriaSub(StoppingCriteria):
+
+ def __init__(self, stops=[], encounters=1):
+ super().__init__()
+ self.stops = stops
+
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
+ for stop in self.stops:
+ if torch.all((stop == input_ids[0][-len(stop):])).item():
+ return True
+
+ return False
+
+
+CONV_VISION = Conversation(
+ system="Give the following image: ImageContent. "
+ "You will be able to see the image once I provide it to you. Please answer my questions.",
+ roles=("Human", "Assistant"),
+ messages=[],
+ offset=2,
+ sep_style=SeparatorStyle.SINGLE,
+ sep="###",
+)
+
+
+
+class Chat:
+ def __init__(self, model, vis_processor, device='cuda:0'):
+ self.device = device
+ self.model = model
+ self.vis_processor = vis_processor
+ stop_words_ids = [torch.tensor([835]).to(self.device),
+ torch.tensor([2277, 29937]).to(self.device)] # '###' can be encoded in two different ways.
+ self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
+
+ def ask(self, text, conv):
+ if len(conv.messages) > 0 and conv.messages[-1][0] == conv.roles[0] \
+                and conv.messages[-1][1][-6:] == '</Img>':  # last message is image.
+ conv.messages[-1][1] = ' '.join([conv.messages[-1][1], text])
+ else:
+ conv.append_message(conv.roles[0], text)
+
+ def answer(self, conv, img_list, max_new_tokens=300, num_beams=1, min_length=1, top_p=0.9,
+ repetition_penalty=1.0, length_penalty=1, temperature=1.0, max_length=2000):
+ conv.append_message(conv.roles[1], None)
+ embs = self.get_context_emb(conv, img_list)
+
+ current_max_len = embs.shape[1] + max_new_tokens
+ if current_max_len - max_length > 0:
+ print('Warning: The number of tokens in current conversation exceeds the max length. '
+ 'The model will not see the contexts outside the range.')
+ begin_idx = max(0, current_max_len - max_length)
+
+ embs = embs[:, begin_idx:]
+
+ outputs = self.model.llama_model.generate(
+ inputs_embeds=embs,
+ max_new_tokens=max_new_tokens,
+ stopping_criteria=self.stopping_criteria,
+ num_beams=num_beams,
+ do_sample=True,
+ min_length=min_length,
+ top_p=top_p,
+ repetition_penalty=repetition_penalty,
+ length_penalty=length_penalty,
+ temperature=temperature,
+ )
+ output_token = outputs[0]
+        if output_token[0] == 0:  # the model might output an unknown token at the beginning; remove it
+ output_token = output_token[1:]
+ if output_token[0] == 1: # some users find that there is a start token at the beginning. remove it
+ output_token = output_token[1:]
+ output_text = self.model.llama_tokenizer.decode(output_token, add_special_tokens=False)
+ output_text = output_text.split('###')[0] # remove the stop sign '###'
+ output_text = output_text.split('Assistant:')[-1].strip()
+ conv.messages[-1][1] = output_text
+ return output_text, output_token.cpu().numpy()
+
+ def upload_img(self, image, conv, img_list):
+        if isinstance(image, str):  # is an image path
+ raw_image = Image.open(image).convert('RGB')
+ image = self.vis_processor(raw_image).unsqueeze(0).to(self.device)
+ elif isinstance(image, Image.Image):
+ raw_image = image
+ image = self.vis_processor(raw_image).unsqueeze(0).to(self.device)
+ elif isinstance(image, torch.Tensor):
+ if len(image.shape) == 3:
+ image = image.unsqueeze(0)
+ image = image.to(self.device)
+
+ image_emb, _ = self.model.encode_img(image)
+ img_list.append(image_emb)
+ conv.append_message(conv.roles[0], "")
+ msg = "Received."
+ # self.conv.append_message(self.conv.roles[1], msg)
+ return msg
+
+ def get_context_emb(self, conv, img_list):
+ prompt = conv.get_prompt()
+        prompt_segs = prompt.split('<ImageHere>')
+ assert len(prompt_segs) == len(img_list) + 1, "Unmatched numbers of image placeholders and images."
+ seg_tokens = [
+ self.model.llama_tokenizer(
+ seg, return_tensors="pt", add_special_tokens=i == 0).to(self.device).input_ids
+ # only add bos to the first seg
+ for i, seg in enumerate(prompt_segs)
+ ]
+ seg_embs = [self.model.llama_model.model.embed_tokens(seg_t) for seg_t in seg_tokens]
+ mixed_embs = [emb for pair in zip(seg_embs[:-1], img_list) for emb in pair] + [seg_embs[-1]]
+ mixed_embs = torch.cat(mixed_embs, dim=1)
+ return mixed_embs
+
+
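+# Illustrative usage (model, processor, and image path are placeholders): a minimal
+# single-turn chat with a loaded MiniGPT-4 model.
+#   chat = Chat(model, vis_processor, device='cuda:0')
+#   conv = CONV_VISION.copy()
+#   img_list = []
+#   chat.upload_img('demo.jpg', conv, img_list)
+#   chat.ask('Describe this image.', conv)
+#   answer, _ = chat.answer(conv, img_list, max_new_tokens=300)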
diff --git a/models/MiniGPT4/minigpt4/datasets/__init__.py b/models/MiniGPT4/minigpt4/datasets/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/MiniGPT4/minigpt4/datasets/__pycache__/__init__.cpython-310.pyc b/models/MiniGPT4/minigpt4/datasets/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..3e4f608
Binary files /dev/null and b/models/MiniGPT4/minigpt4/datasets/__pycache__/__init__.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/datasets/__pycache__/data_utils.cpython-310.pyc b/models/MiniGPT4/minigpt4/datasets/__pycache__/data_utils.cpython-310.pyc
new file mode 100644
index 0000000..b8e5611
Binary files /dev/null and b/models/MiniGPT4/minigpt4/datasets/__pycache__/data_utils.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/datasets/builders/__init__.py b/models/MiniGPT4/minigpt4/datasets/builders/__init__.py
new file mode 100644
index 0000000..6869113
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/datasets/builders/__init__.py
@@ -0,0 +1,71 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+from minigpt4.datasets.builders.base_dataset_builder import load_dataset_config
+from minigpt4.datasets.builders.image_text_pair_builder import (
+ CCSBUBuilder,
+ LaionBuilder,
+ CCSBUAlignBuilder
+)
+from minigpt4.common.registry import registry
+
+__all__ = [
+ "CCSBUBuilder",
+ "LaionBuilder",
+ "CCSBUAlignBuilder"
+]
+
+
+def load_dataset(name, cfg_path=None, vis_path=None, data_type=None):
+ """
+ Example
+
+ >>> dataset = load_dataset("coco_caption", cfg=None)
+ >>> splits = dataset.keys()
+ >>> print([len(dataset[split]) for split in splits])
+
+ """
+ if cfg_path is None:
+ cfg = None
+ else:
+ cfg = load_dataset_config(cfg_path)
+
+ try:
+ builder = registry.get_builder_class(name)(cfg)
+ except TypeError:
+ print(
+ f"Dataset {name} not found. Available datasets:\n"
+ + ", ".join([str(k) for k in dataset_zoo.get_names()])
+ )
+ exit(1)
+
+ if vis_path is not None:
+ if data_type is None:
+ # use default data type in the config
+ data_type = builder.config.data_type
+
+ assert (
+ data_type in builder.config.build_info
+ ), f"Invalid data_type {data_type} for {name}."
+
+ builder.config.build_info.get(data_type).storage = vis_path
+
+ dataset = builder.build_datasets()
+ return dataset
+
+
+class DatasetZoo:
+ def __init__(self) -> None:
+ self.dataset_zoo = {
+ k: list(v.DATASET_CONFIG_DICT.keys())
+ for k, v in sorted(registry.mapping["builder_name_mapping"].items())
+ }
+
+ def get_names(self):
+ return list(self.dataset_zoo.keys())
+
+
+dataset_zoo = DatasetZoo()
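+
+
+# Illustrative usage: build a registered dataset from its default config.
+#   dataset = load_dataset("cc_sbu_align")
+#   print(dataset_zoo.get_names())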
diff --git a/models/MiniGPT4/minigpt4/datasets/builders/__pycache__/__init__.cpython-310.pyc b/models/MiniGPT4/minigpt4/datasets/builders/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..bc231ec
Binary files /dev/null and b/models/MiniGPT4/minigpt4/datasets/builders/__pycache__/__init__.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/datasets/builders/__pycache__/base_dataset_builder.cpython-310.pyc b/models/MiniGPT4/minigpt4/datasets/builders/__pycache__/base_dataset_builder.cpython-310.pyc
new file mode 100644
index 0000000..82b1f35
Binary files /dev/null and b/models/MiniGPT4/minigpt4/datasets/builders/__pycache__/base_dataset_builder.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/datasets/builders/__pycache__/image_text_pair_builder.cpython-310.pyc b/models/MiniGPT4/minigpt4/datasets/builders/__pycache__/image_text_pair_builder.cpython-310.pyc
new file mode 100644
index 0000000..cca792b
Binary files /dev/null and b/models/MiniGPT4/minigpt4/datasets/builders/__pycache__/image_text_pair_builder.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/datasets/builders/base_dataset_builder.py b/models/MiniGPT4/minigpt4/datasets/builders/base_dataset_builder.py
new file mode 100644
index 0000000..4b607e3
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/datasets/builders/base_dataset_builder.py
@@ -0,0 +1,236 @@
+"""
+ This file is from
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import logging
+import os
+import shutil
+import warnings
+
+from omegaconf import OmegaConf
+import torch.distributed as dist
+from torchvision.datasets.utils import download_url
+
+import minigpt4.common.utils as utils
+from minigpt4.common.dist_utils import is_dist_avail_and_initialized, is_main_process
+from minigpt4.common.registry import registry
+from minigpt4.processors.base_processor import BaseProcessor
+
+
+
+class BaseDatasetBuilder:
+ train_dataset_cls, eval_dataset_cls = None, None
+
+ def __init__(self, cfg=None):
+ super().__init__()
+
+ if cfg is None:
+ # help to create datasets from default config.
+ self.config = load_dataset_config(self.default_config_path())
+ elif isinstance(cfg, str):
+ self.config = load_dataset_config(cfg)
+ else:
+ # when called from task.build_dataset()
+ self.config = cfg
+
+ self.data_type = self.config.data_type
+
+ self.vis_processors = {"train": BaseProcessor(), "eval": BaseProcessor()}
+ self.text_processors = {"train": BaseProcessor(), "eval": BaseProcessor()}
+
+ def build_datasets(self):
+ # download, split, etc...
+ # only called on 1 GPU/TPU in distributed
+
+ if is_main_process():
+ self._download_data()
+
+ if is_dist_avail_and_initialized():
+ dist.barrier()
+
+ # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
+ logging.info("Building datasets...")
+ datasets = self.build() # dataset['train'/'val'/'test']
+
+ return datasets
+
+ def build_processors(self):
+ vis_proc_cfg = self.config.get("vis_processor")
+ txt_proc_cfg = self.config.get("text_processor")
+
+ if vis_proc_cfg is not None:
+ vis_train_cfg = vis_proc_cfg.get("train")
+ vis_eval_cfg = vis_proc_cfg.get("eval")
+
+ self.vis_processors["train"] = self._build_proc_from_cfg(vis_train_cfg)
+ self.vis_processors["eval"] = self._build_proc_from_cfg(vis_eval_cfg)
+
+ if txt_proc_cfg is not None:
+ txt_train_cfg = txt_proc_cfg.get("train")
+ txt_eval_cfg = txt_proc_cfg.get("eval")
+
+ self.text_processors["train"] = self._build_proc_from_cfg(txt_train_cfg)
+ self.text_processors["eval"] = self._build_proc_from_cfg(txt_eval_cfg)
+
+ @staticmethod
+ def _build_proc_from_cfg(cfg):
+ return (
+ registry.get_processor_class(cfg.name).from_config(cfg)
+ if cfg is not None
+ else None
+ )
+
+ @classmethod
+ def default_config_path(cls, type="default"):
+ return utils.get_abs_path(cls.DATASET_CONFIG_DICT[type])
+
+ def _download_data(self):
+ self._download_ann()
+ self._download_vis()
+
+ def _download_ann(self):
+ """
+ Download annotation files if necessary.
+ All the vision-language datasets should have annotations of unified format.
+
+ storage_path can be:
+ (1) relative/absolute: will be prefixed with env.cache_root to make full path if relative.
+ (2) basename/dirname: will be suffixed with base name of URL if dirname is provided.
+
+ Local annotation paths should be relative.
+ """
+ anns = self.config.build_info.annotations
+
+ splits = anns.keys()
+
+ cache_root = registry.get_path("cache_root")
+
+ for split in splits:
+ info = anns[split]
+
+ urls, storage_paths = info.get("url", None), info.storage
+
+ if isinstance(urls, str):
+ urls = [urls]
+ if isinstance(storage_paths, str):
+ storage_paths = [storage_paths]
+
+ assert len(urls) == len(storage_paths)
+
+ for url_or_filename, storage_path in zip(urls, storage_paths):
+ # if storage_path is relative, make it full by prefixing with cache_root.
+ if not os.path.isabs(storage_path):
+ storage_path = os.path.join(cache_root, storage_path)
+
+ dirname = os.path.dirname(storage_path)
+ if not os.path.exists(dirname):
+ os.makedirs(dirname)
+
+ if os.path.isfile(url_or_filename):
+ src, dst = url_or_filename, storage_path
+ if not os.path.exists(dst):
+ shutil.copyfile(src=src, dst=dst)
+ else:
+ logging.info("Using existing file {}.".format(dst))
+ else:
+ if os.path.isdir(storage_path):
+ # if only dirname is provided, suffix with basename of URL.
+ raise ValueError(
+ "Expecting storage_path to be a file path, got directory {}".format(
+ storage_path
+ )
+ )
+ else:
+ filename = os.path.basename(storage_path)
+
+ download_url(url=url_or_filename, root=dirname, filename=filename)
+
+ def _download_vis(self):
+
+ storage_path = self.config.build_info.get(self.data_type).storage
+ storage_path = utils.get_cache_path(storage_path)
+
+ if not os.path.exists(storage_path):
+ warnings.warn(
+ f"""
+ The specified path {storage_path} for visual inputs does not exist.
+ Please provide a correct path to the visual inputs or
+ refer to datasets/download_scripts/README.md for downloading instructions.
+ """
+ )
+
+ def build(self):
+ """
+ Create by split datasets inheriting torch.utils.data.Datasets.
+
+ # build() can be dataset-specific. Overwrite to customize.
+ """
+ self.build_processors()
+
+ build_info = self.config.build_info
+
+ ann_info = build_info.annotations
+ vis_info = build_info.get(self.data_type)
+
+ datasets = dict()
+ for split in ann_info.keys():
+ if split not in ["train", "val", "test"]:
+ continue
+
+ is_train = split == "train"
+
+ # processors
+ vis_processor = (
+ self.vis_processors["train"]
+ if is_train
+ else self.vis_processors["eval"]
+ )
+ text_processor = (
+ self.text_processors["train"]
+ if is_train
+ else self.text_processors["eval"]
+ )
+
+ # annotation path
+ ann_paths = ann_info.get(split).storage
+ if isinstance(ann_paths, str):
+ ann_paths = [ann_paths]
+
+ abs_ann_paths = []
+ for ann_path in ann_paths:
+ if not os.path.isabs(ann_path):
+ ann_path = utils.get_cache_path(ann_path)
+ abs_ann_paths.append(ann_path)
+ ann_paths = abs_ann_paths
+
+ # visual data storage path
+ vis_path = os.path.join(vis_info.storage, split)
+
+ if not os.path.isabs(vis_path):
+ # vis_path = os.path.join(utils.get_cache_path(), vis_path)
+ vis_path = utils.get_cache_path(vis_path)
+
+ if not os.path.exists(vis_path):
+ warnings.warn("storage path {} does not exist.".format(vis_path))
+
+ # create datasets
+ dataset_cls = self.train_dataset_cls if is_train else self.eval_dataset_cls
+ datasets[split] = dataset_cls(
+ vis_processor=vis_processor,
+ text_processor=text_processor,
+ ann_paths=ann_paths,
+ vis_root=vis_path,
+ )
+
+ return datasets
+
+
+def load_dataset_config(cfg_path):
+ cfg = OmegaConf.load(cfg_path).datasets
+ cfg = cfg[list(cfg.keys())[0]]
+
+ return cfg
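+
+
+# Illustrative usage (config path is an example): load a dataset config and inspect its build info.
+#   cfg = load_dataset_config("minigpt4/configs/datasets/cc_sbu/align.yaml")
+#   print(cfg.data_type, cfg.build_info.storage)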
diff --git a/models/MiniGPT4/minigpt4/datasets/builders/image_text_pair_builder.py b/models/MiniGPT4/minigpt4/datasets/builders/image_text_pair_builder.py
new file mode 100644
index 0000000..e5d66b8
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/datasets/builders/image_text_pair_builder.py
@@ -0,0 +1,105 @@
+import os
+import logging
+import warnings
+
+from minigpt4.common.registry import registry
+from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder
+from minigpt4.datasets.datasets.laion_dataset import LaionDataset
+from minigpt4.datasets.datasets.cc_sbu_dataset import CCSBUDataset, CCSBUAlignDataset
+
+
+@registry.register_builder("cc_sbu")
+class CCSBUBuilder(BaseDatasetBuilder):
+ train_dataset_cls = CCSBUDataset
+
+ DATASET_CONFIG_DICT = {"default": "configs/datasets/cc_sbu/defaults.yaml"}
+
+ def _download_ann(self):
+ pass
+
+ def _download_vis(self):
+ pass
+
+ def build(self):
+ self.build_processors()
+
+ build_info = self.config.build_info
+
+ datasets = dict()
+ split = "train"
+
+ # create datasets
+ # [NOTE] return inner_datasets (wds.DataPipeline)
+ dataset_cls = self.train_dataset_cls
+ datasets[split] = dataset_cls(
+ vis_processor=self.vis_processors[split],
+ text_processor=self.text_processors[split],
+ location=build_info.storage,
+ ).inner_dataset
+
+ return datasets
+
+
+@registry.register_builder("laion")
+class LaionBuilder(BaseDatasetBuilder):
+ train_dataset_cls = LaionDataset
+
+ DATASET_CONFIG_DICT = {"default": "configs/datasets/laion/defaults.yaml"}
+
+ def _download_ann(self):
+ pass
+
+ def _download_vis(self):
+ pass
+
+ def build(self):
+ self.build_processors()
+
+ build_info = self.config.build_info
+
+ datasets = dict()
+ split = "train"
+
+ # create datasets
+ # [NOTE] return inner_datasets (wds.DataPipeline)
+ dataset_cls = self.train_dataset_cls
+ datasets[split] = dataset_cls(
+ vis_processor=self.vis_processors[split],
+ text_processor=self.text_processors[split],
+ location=build_info.storage,
+ ).inner_dataset
+
+ return datasets
+
+
+@registry.register_builder("cc_sbu_align")
+class CCSBUAlignBuilder(BaseDatasetBuilder):
+ train_dataset_cls = CCSBUAlignDataset
+
+ DATASET_CONFIG_DICT = {
+ "default": "configs/datasets/cc_sbu/align.yaml",
+ }
+
+ def build_datasets(self):
+ # at this point, all the annotations and image/videos should be all downloaded to the specified locations.
+ logging.info("Building datasets...")
+ self.build_processors()
+
+ build_info = self.config.build_info
+ storage_path = build_info.storage
+
+ datasets = dict()
+
+ if not os.path.exists(storage_path):
+ warnings.warn("storage path {} does not exist.".format(storage_path))
+
+ # create datasets
+ dataset_cls = self.train_dataset_cls
+ datasets['train'] = dataset_cls(
+ vis_processor=self.vis_processors["train"],
+ text_processor=self.text_processors["train"],
+ ann_paths=[os.path.join(storage_path, 'filter_cap.json')],
+ vis_root=os.path.join(storage_path, 'image'),
+ )
+
+ return datasets
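+
+
+if __name__ == "__main__":
+    # Usage sketch (illustrative; the storage path is a hypothetical placeholder and real
+    # processors would normally come from build_processors): the aligned CC/SBU data is
+    # expected to hold filter_cap.json plus an image/ folder, exactly as wired up in
+    # CCSBUAlignBuilder.build_datasets above.
+    storage = "/path/to/cc_sbu_align"
+    dataset = CCSBUAlignDataset(
+        vis_processor=lambda img: img,   # identity stand-ins, for illustration only
+        text_processor=lambda txt: txt,
+        ann_paths=[os.path.join(storage, "filter_cap.json")],
+        vis_root=os.path.join(storage, "image"),
+    )
+    print(len(dataset))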
diff --git a/models/MiniGPT4/minigpt4/datasets/data_utils.py b/models/MiniGPT4/minigpt4/datasets/data_utils.py
new file mode 100644
index 0000000..cf6497f
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/datasets/data_utils.py
@@ -0,0 +1,196 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import gzip
+import logging
+import os
+import random as rnd
+import tarfile
+import zipfile
+import random
+from typing import List
+from tqdm import tqdm
+
+import decord
+from decord import VideoReader
+import webdataset as wds
+import numpy as np
+import torch
+from torch.utils.data.dataset import IterableDataset
+
+from minigpt4.common.registry import registry
+from minigpt4.datasets.datasets.base_dataset import ConcatDataset
+
+
+decord.bridge.set_bridge("torch")
+MAX_INT = registry.get("MAX_INT")
+
+
+class ChainDataset(wds.DataPipeline):
+ r"""Dataset for chaining multiple :class:`DataPipeline` s.
+
+ This class is useful to assemble different existing dataset streams. The
+ chaining operation is done on-the-fly, so concatenating large-scale
+ datasets with this class will be efficient.
+
+ Args:
+ datasets (iterable of IterableDataset): datasets to be chained together
+ """
+ def __init__(self, datasets: List[wds.DataPipeline]) -> None:
+ super().__init__()
+ self.datasets = datasets
+ self.prob = []
+ self.names = []
+ for dataset in self.datasets:
+ if hasattr(dataset, 'name'):
+ self.names.append(dataset.name)
+ else:
+ self.names.append('Unknown')
+ if hasattr(dataset, 'sample_ratio'):
+ self.prob.append(dataset.sample_ratio)
+ else:
+ self.prob.append(1)
+ logging.info("One of the datapipeline doesn't define ratio and set to 1 automatically.")
+
+ def __iter__(self):
+ datastreams = [iter(dataset) for dataset in self.datasets]
+ while True:
+ select_datastream = random.choices(datastreams, weights=self.prob, k=1)[0]
+ yield next(select_datastream)
+
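+
+# Sketch of the weighted selection done in ChainDataset.__iter__ (illustrative only; plain
+# infinite iterators stand in for wds.DataPipeline streams here):
+def _chain_sampling_sketch():
+    import itertools
+
+    streams = [itertools.cycle("a"), itertools.cycle("b")]
+    prob = [3, 1]  # stream "a" is drawn roughly three times as often as stream "b"
+    return [next(random.choices(streams, weights=prob, k=1)[0]) for _ in range(12)]
+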
+
+def apply_to_sample(f, sample):
+ if len(sample) == 0:
+ return {}
+
+ def _apply(x):
+ if torch.is_tensor(x):
+ return f(x)
+ elif isinstance(x, dict):
+ return {key: _apply(value) for key, value in x.items()}
+ elif isinstance(x, list):
+ return [_apply(x) for x in x]
+ else:
+ return x
+
+ return _apply(sample)
+
+
+def move_to_cuda(sample):
+ def _move_to_cuda(tensor):
+ return tensor.cuda()
+
+ return apply_to_sample(_move_to_cuda, sample)
+
+
+def prepare_sample(samples, cuda_enabled=True):
+ if cuda_enabled:
+ samples = move_to_cuda(samples)
+
+ # TODO fp16 support
+
+ return samples
+
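+
+# Sketch of apply_to_sample on a nested batch (illustrative): the function is applied to every
+# tensor while dicts and lists are traversed recursively, which is exactly how move_to_cuda
+# pushes a whole sample to the GPU.
+def _apply_to_sample_sketch():
+    sample = {"image": torch.zeros(3, 224, 224), "meta": [torch.tensor(1), "a caption"]}
+    halved = apply_to_sample(lambda t: t.half(), sample)
+    return halved["image"].dtype  # torch.float16; the string is left untouched
+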
+
+def reorg_datasets_by_split(datasets):
+ """
+ Organizes datasets by split.
+
+ Args:
+ datasets: dict of torch.utils.data.Dataset objects by name.
+
+ Returns:
+ Dict of datasets by split {split_name: List[Datasets]}.
+ """
+ # if len(datasets) == 1:
+ # return datasets[list(datasets.keys())[0]]
+ # else:
+ reorg_datasets = dict()
+
+ # reorganize by split
+ for _, dataset in datasets.items():
+ for split_name, dataset_split in dataset.items():
+ if split_name not in reorg_datasets:
+ reorg_datasets[split_name] = [dataset_split]
+ else:
+ reorg_datasets[split_name].append(dataset_split)
+
+ return reorg_datasets
+
+
+def concat_datasets(datasets):
+ """
+ Concatenates multiple datasets into a single dataset.
+
+    It supports map-style datasets and DataPipeline from WebDataset. Generic IterableDataset is
+    currently not supported because it requires creating separate samplers.
+
+    Only training datasets are concatenated; validation and testing splits are assumed to contain
+    a single dataset each, because metrics should not be computed on concatenated datasets.
+
+ Args:
+ datasets: dict of torch.utils.data.Dataset objects by split.
+
+ Returns:
+ Dict of concatenated datasets by split, "train" is the concatenation of multiple datasets,
+ "val" and "test" remain the same.
+
+ If the input training datasets contain both map-style and DataPipeline datasets, returns
+ a tuple, where the first element is a concatenated map-style dataset and the second
+ element is a chained DataPipeline dataset.
+
+ """
+ # concatenate datasets in the same split
+ for split_name in datasets:
+ if split_name != "train":
+ assert (
+ len(datasets[split_name]) == 1
+ ), "Do not support multiple {} datasets.".format(split_name)
+ datasets[split_name] = datasets[split_name][0]
+ else:
+ iterable_datasets, map_datasets = [], []
+ for dataset in datasets[split_name]:
+ if isinstance(dataset, wds.DataPipeline):
+ logging.info(
+ "Dataset {} is IterableDataset, can't be concatenated.".format(
+ dataset
+ )
+ )
+ iterable_datasets.append(dataset)
+ elif isinstance(dataset, IterableDataset):
+ raise NotImplementedError(
+ "Do not support concatenation of generic IterableDataset."
+ )
+ else:
+ map_datasets.append(dataset)
+
+ # if len(iterable_datasets) > 0:
+ # concatenate map-style datasets and iterable-style datasets separately
+ if len(iterable_datasets) > 1:
+ chained_datasets = (
+ ChainDataset(iterable_datasets)
+ )
+ elif len(iterable_datasets) == 1:
+ chained_datasets = iterable_datasets[0]
+ else:
+ chained_datasets = None
+
+ concat_datasets = (
+ ConcatDataset(map_datasets) if len(map_datasets) > 0 else None
+ )
+
+ train_datasets = concat_datasets, chained_datasets
+ train_datasets = tuple([x for x in train_datasets if x is not None])
+ train_datasets = (
+ train_datasets[0] if len(train_datasets) == 1 else train_datasets
+ )
+
+ datasets[split_name] = train_datasets
+
+ return datasets
+
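+
+if __name__ == "__main__":
+    # Sketch of the reorg + concat flow (illustrative): TensorDataset stands in for the
+    # project's map-style datasets, and "coco"/"sbu" are hypothetical dataset names.
+    from torch.utils.data import TensorDataset
+
+    coco = {"train": TensorDataset(torch.arange(4)), "val": TensorDataset(torch.arange(2))}
+    sbu = {"train": TensorDataset(torch.arange(6))}
+
+    by_split = reorg_datasets_by_split({"coco": coco, "sbu": sbu})
+    # by_split == {"train": [coco_train, sbu_train], "val": [coco_val]}
+
+    merged = concat_datasets(by_split)
+    print(len(merged["train"]))  # 10: the two train sets are concatenated
+    print(len(merged["val"]))    # 2: non-train splits must hold a single dataset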
diff --git a/models/MiniGPT4/minigpt4/datasets/datasets/__init__.py b/models/MiniGPT4/minigpt4/datasets/datasets/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/models/MiniGPT4/minigpt4/datasets/datasets/__pycache__/__init__.cpython-310.pyc b/models/MiniGPT4/minigpt4/datasets/datasets/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..b4b5e92
Binary files /dev/null and b/models/MiniGPT4/minigpt4/datasets/datasets/__pycache__/__init__.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/datasets/datasets/__pycache__/base_dataset.cpython-310.pyc b/models/MiniGPT4/minigpt4/datasets/datasets/__pycache__/base_dataset.cpython-310.pyc
new file mode 100644
index 0000000..eb1bdb6
Binary files /dev/null and b/models/MiniGPT4/minigpt4/datasets/datasets/__pycache__/base_dataset.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/datasets/datasets/__pycache__/caption_datasets.cpython-310.pyc b/models/MiniGPT4/minigpt4/datasets/datasets/__pycache__/caption_datasets.cpython-310.pyc
new file mode 100644
index 0000000..c2fac33
Binary files /dev/null and b/models/MiniGPT4/minigpt4/datasets/datasets/__pycache__/caption_datasets.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/datasets/datasets/__pycache__/cc_sbu_dataset.cpython-310.pyc b/models/MiniGPT4/minigpt4/datasets/datasets/__pycache__/cc_sbu_dataset.cpython-310.pyc
new file mode 100644
index 0000000..ca8d2df
Binary files /dev/null and b/models/MiniGPT4/minigpt4/datasets/datasets/__pycache__/cc_sbu_dataset.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/datasets/datasets/__pycache__/dataloader_utils.cpython-310.pyc b/models/MiniGPT4/minigpt4/datasets/datasets/__pycache__/dataloader_utils.cpython-310.pyc
new file mode 100644
index 0000000..c8e2845
Binary files /dev/null and b/models/MiniGPT4/minigpt4/datasets/datasets/__pycache__/dataloader_utils.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/datasets/datasets/__pycache__/laion_dataset.cpython-310.pyc b/models/MiniGPT4/minigpt4/datasets/datasets/__pycache__/laion_dataset.cpython-310.pyc
new file mode 100644
index 0000000..c6a4757
Binary files /dev/null and b/models/MiniGPT4/minigpt4/datasets/datasets/__pycache__/laion_dataset.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/datasets/datasets/base_dataset.py b/models/MiniGPT4/minigpt4/datasets/datasets/base_dataset.py
new file mode 100644
index 0000000..ae2a8d0
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/datasets/datasets/base_dataset.py
@@ -0,0 +1,68 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import json
+from typing import Iterable
+
+from torch.utils.data import Dataset, ConcatDataset
+from torch.utils.data.dataloader import default_collate
+
+
+class BaseDataset(Dataset):
+ def __init__(
+ self, vis_processor=None, text_processor=None, vis_root=None, ann_paths=[]
+ ):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+        ann_paths (list of string): paths to the annotation files
+ """
+ self.vis_root = vis_root
+
+ self.annotation = []
+ for ann_path in ann_paths:
+ self.annotation.extend(json.load(open(ann_path, "r"))['annotations'])
+
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+
+ self._add_instance_ids()
+
+ def __len__(self):
+ return len(self.annotation)
+
+ def collater(self, samples):
+ return default_collate(samples)
+
+ def set_processors(self, vis_processor, text_processor):
+ self.vis_processor = vis_processor
+ self.text_processor = text_processor
+
+ def _add_instance_ids(self, key="instance_id"):
+ for idx, ann in enumerate(self.annotation):
+ ann[key] = str(idx)
+
+
+class ConcatDataset(ConcatDataset):
+ def __init__(self, datasets: Iterable[Dataset]) -> None:
+ super().__init__(datasets)
+
+ def collater(self, samples):
+ # TODO For now only supports datasets with same underlying collater implementations
+
+ all_keys = set()
+ for s in samples:
+ all_keys.update(s)
+
+ shared_keys = all_keys
+ for s in samples:
+ shared_keys = shared_keys & set(s.keys())
+
+ samples_shared_keys = []
+ for s in samples:
+ samples_shared_keys.append({k: s[k] for k in s.keys() if k in shared_keys})
+
+ return self.datasets[0].collater(samples_shared_keys)
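+
+
+if __name__ == "__main__":
+    # Sketch of the shared-key collation above (illustrative): when samples expose slightly
+    # different fields, only the intersection of keys is batched.
+    samples = [
+        {"image": 0, "text_input": "a", "image_id": 7},
+        {"image": 1, "text_input": "b"},  # no "image_id" field
+    ]
+    shared_keys = set(samples[0]).intersection(*samples[1:])
+    batch = default_collate([{k: s[k] for k in shared_keys} for s in samples])
+    print(sorted(batch.keys()))  # ['image', 'text_input']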
diff --git a/models/MiniGPT4/minigpt4/datasets/datasets/caption_datasets.py b/models/MiniGPT4/minigpt4/datasets/datasets/caption_datasets.py
new file mode 100644
index 0000000..78bab66
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/datasets/datasets/caption_datasets.py
@@ -0,0 +1,85 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import os
+from collections import OrderedDict
+
+from minigpt4.datasets.datasets.base_dataset import BaseDataset
+from PIL import Image
+
+
+class __DisplMixin:
+ def displ_item(self, index):
+ sample, ann = self.__getitem__(index), self.annotation[index]
+
+ return OrderedDict(
+ {
+ "file": ann["image"],
+ "caption": ann["caption"],
+ "image": sample["image"],
+ }
+ )
+
+
+class CaptionDataset(BaseDataset, __DisplMixin):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+        ann_paths (list of string): paths to the annotation files
+ """
+ super().__init__(vis_processor, text_processor, vis_root, ann_paths)
+
+ self.img_ids = {}
+ n = 0
+ for ann in self.annotation:
+ img_id = ann["image_id"]
+ if img_id not in self.img_ids.keys():
+ self.img_ids[img_id] = n
+ n += 1
+
+ def __getitem__(self, index):
+
+ # TODO this assumes image input, not general enough
+ ann = self.annotation[index]
+
+ img_file = '{:0>12}.jpg'.format(ann["image_id"])
+ image_path = os.path.join(self.vis_root, img_file)
+ image = Image.open(image_path).convert("RGB")
+
+ image = self.vis_processor(image)
+ caption = self.text_processor(ann["caption"])
+
+ return {
+ "image": image,
+ "text_input": caption,
+ "image_id": self.img_ids[ann["image_id"]],
+ }
+
+
+class CaptionEvalDataset(BaseDataset, __DisplMixin):
+ def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
+ """
+ vis_root (string): Root directory of images (e.g. coco/images/)
+        ann_paths (list of string): paths to the annotation files
+ split (string): val or test
+ """
+ super().__init__(vis_processor, text_processor, vis_root, ann_paths)
+
+ def __getitem__(self, index):
+
+ ann = self.annotation[index]
+
+ image_path = os.path.join(self.vis_root, ann["image"])
+ image = Image.open(image_path).convert("RGB")
+
+ image = self.vis_processor(image)
+
+ return {
+ "image": image,
+ "image_id": ann["image_id"],
+ "instance_id": ann["instance_id"],
+ }
diff --git a/models/MiniGPT4/minigpt4/datasets/datasets/cc_sbu_dataset.py b/models/MiniGPT4/minigpt4/datasets/datasets/cc_sbu_dataset.py
new file mode 100644
index 0000000..f42bbce
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/datasets/datasets/cc_sbu_dataset.py
@@ -0,0 +1,47 @@
+import os
+from PIL import Image
+import webdataset as wds
+from minigpt4.datasets.datasets.base_dataset import BaseDataset
+from minigpt4.datasets.datasets.caption_datasets import CaptionDataset
+
+
+class CCSBUDataset(BaseDataset):
+ def __init__(self, vis_processor, text_processor, location):
+ super().__init__(vis_processor=vis_processor, text_processor=text_processor)
+
+ self.inner_dataset = wds.DataPipeline(
+ wds.ResampledShards(location),
+ wds.tarfile_to_samples(handler=wds.warn_and_continue),
+ wds.shuffle(1000, handler=wds.warn_and_continue),
+ wds.decode("pilrgb", handler=wds.warn_and_continue),
+ wds.to_tuple("jpg", "json", handler=wds.warn_and_continue),
+ wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue),
+ wds.map(self.to_dict, handler=wds.warn_and_continue),
+ )
+
+ def to_dict(self, sample):
+ return {
+ "image": sample[0],
+ "text_input": self.text_processor(sample[1]["caption"]),
+ }
+
+
+class CCSBUAlignDataset(CaptionDataset):
+
+ def __getitem__(self, index):
+
+ # TODO this assumes image input, not general enough
+ ann = self.annotation[index]
+
+ img_file = '{}.jpg'.format(ann["image_id"])
+ image_path = os.path.join(self.vis_root, img_file)
+ image = Image.open(image_path).convert("RGB")
+
+ image = self.vis_processor(image)
+ caption = ann["caption"]
+
+ return {
+ "image": image,
+ "text_input": caption,
+ "image_id": self.img_ids[ann["image_id"]],
+ }
\ No newline at end of file
diff --git a/models/MiniGPT4/minigpt4/datasets/datasets/dataloader_utils.py b/models/MiniGPT4/minigpt4/datasets/datasets/dataloader_utils.py
new file mode 100644
index 0000000..8eaa3a5
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/datasets/datasets/dataloader_utils.py
@@ -0,0 +1,162 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import time
+import random
+import torch
+from minigpt4.datasets.data_utils import move_to_cuda
+from torch.utils.data import DataLoader
+
+
+class MultiIterLoader:
+ """
+ A simple wrapper for iterating over multiple iterators.
+
+ Args:
+        loaders (List[Iterator]): List of iterator-style loaders, each implementing __next__.
+ ratios (List[float]): List of ratios to sample from each loader. If None, all loaders are sampled uniformly.
+ """
+
+ def __init__(self, loaders, ratios=None):
+ # assert all loaders has __next__ method
+ for loader in loaders:
+ assert hasattr(
+ loader, "__next__"
+ ), "Loader {} has no __next__ method.".format(loader)
+
+ if ratios is None:
+ ratios = [1.0] * len(loaders)
+ else:
+ assert len(ratios) == len(loaders)
+ ratios = [float(ratio) / sum(ratios) for ratio in ratios]
+
+ self.loaders = loaders
+ self.ratios = ratios
+
+ def __next__(self):
+ # random sample from each loader by ratio
+ loader_idx = random.choices(range(len(self.loaders)), self.ratios, k=1)[0]
+ return next(self.loaders[loader_idx])
+
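+
+# Usage sketch for MultiIterLoader (illustrative): any object with __next__ works, so plain
+# infinite iterators stand in for real DataLoader iterators here.
+def _multi_iter_loader_sketch():
+    from itertools import cycle
+
+    loader = MultiIterLoader([cycle("a"), cycle("b")], ratios=[9, 1])
+    return [next(loader) for _ in range(10)]  # mostly 'a', occasionally 'b'
+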
+
+class PrefetchLoader(object):
+ """
+ Modified from https://github.com/ChenRocks/UNITER.
+
+ overlap compute and cuda data transfer
+ (copied and then modified from nvidia apex)
+ """
+
+ def __init__(self, loader):
+ self.loader = loader
+ self.stream = torch.cuda.Stream()
+
+ def __iter__(self):
+ loader_it = iter(self.loader)
+ self.preload(loader_it)
+ batch = self.next(loader_it)
+ while batch is not None:
+ is_tuple = isinstance(batch, tuple)
+ if is_tuple:
+ task, batch = batch
+
+ if is_tuple:
+ yield task, batch
+ else:
+ yield batch
+ batch = self.next(loader_it)
+
+ def __len__(self):
+ return len(self.loader)
+
+ def preload(self, it):
+ try:
+ self.batch = next(it)
+ except StopIteration:
+ self.batch = None
+ return
+ # if record_stream() doesn't work, another option is to make sure
+ # device inputs are created on the main stream.
+ # self.next_input_gpu = torch.empty_like(self.next_input,
+ # device='cuda')
+ # self.next_target_gpu = torch.empty_like(self.next_target,
+ # device='cuda')
+ # Need to make sure the memory allocated for next_* is not still in use
+ # by the main stream at the time we start copying to next_*:
+ # self.stream.wait_stream(torch.cuda.current_stream())
+ with torch.cuda.stream(self.stream):
+ self.batch = move_to_cuda(self.batch)
+ # more code for the alternative if record_stream() doesn't work:
+ # copy_ will record the use of the pinned source tensor in this
+ # side stream.
+ # self.next_input_gpu.copy_(self.next_input, non_blocking=True)
+ # self.next_target_gpu.copy_(self.next_target, non_blocking=True)
+ # self.next_input = self.next_input_gpu
+ # self.next_target = self.next_target_gpu
+
+ def next(self, it):
+ torch.cuda.current_stream().wait_stream(self.stream)
+ batch = self.batch
+ if batch is not None:
+ record_cuda_stream(batch)
+ self.preload(it)
+ return batch
+
+ def __getattr__(self, name):
+ method = self.loader.__getattribute__(name)
+ return method
+
+
+def record_cuda_stream(batch):
+ if isinstance(batch, torch.Tensor):
+ batch.record_stream(torch.cuda.current_stream())
+ elif isinstance(batch, list) or isinstance(batch, tuple):
+ for t in batch:
+ record_cuda_stream(t)
+ elif isinstance(batch, dict):
+ for t in batch.values():
+ record_cuda_stream(t)
+ else:
+ pass
+
+
+class IterLoader:
+ """
+ A wrapper to convert DataLoader as an infinite iterator.
+
+ Modified from:
+ https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/iter_based_runner.py
+ """
+
+ def __init__(self, dataloader: DataLoader, use_distributed: bool = False):
+ self._dataloader = dataloader
+ self.iter_loader = iter(self._dataloader)
+ self._use_distributed = use_distributed
+ self._epoch = 0
+
+ @property
+ def epoch(self) -> int:
+ return self._epoch
+
+ def __next__(self):
+ try:
+ data = next(self.iter_loader)
+ except StopIteration:
+ self._epoch += 1
+ if hasattr(self._dataloader.sampler, "set_epoch") and self._use_distributed:
+ self._dataloader.sampler.set_epoch(self._epoch)
+ time.sleep(2) # Prevent possible deadlock during epoch transition
+ self.iter_loader = iter(self._dataloader)
+ data = next(self.iter_loader)
+
+ return data
+
+ def __iter__(self):
+ return self
+
+ def __len__(self):
+ return len(self._dataloader)
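+
+
+if __name__ == "__main__":
+    # Usage sketch for IterLoader (illustrative): instead of raising StopIteration, the
+    # wrapper silently restarts the underlying DataLoader and bumps its epoch counter.
+    from torch.utils.data import TensorDataset
+
+    loader = IterLoader(DataLoader(TensorDataset(torch.arange(4)), batch_size=2))
+    for _ in range(3):  # two batches per epoch, so the third draw starts epoch 1
+        batch = next(loader)
+    print(loader.epoch)  # 1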
diff --git a/models/MiniGPT4/minigpt4/datasets/datasets/laion_dataset.py b/models/MiniGPT4/minigpt4/datasets/datasets/laion_dataset.py
new file mode 100644
index 0000000..1becbe4
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/datasets/datasets/laion_dataset.py
@@ -0,0 +1,31 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import webdataset as wds
+from minigpt4.datasets.datasets.base_dataset import BaseDataset
+
+
+class LaionDataset(BaseDataset):
+ def __init__(self, vis_processor, text_processor, location):
+ super().__init__(vis_processor=vis_processor, text_processor=text_processor)
+
+ self.inner_dataset = wds.DataPipeline(
+ wds.ResampledShards(location),
+ wds.tarfile_to_samples(handler=wds.warn_and_continue),
+ wds.shuffle(1000, handler=wds.warn_and_continue),
+ wds.decode("pilrgb", handler=wds.warn_and_continue),
+ wds.to_tuple("jpg", "json", handler=wds.warn_and_continue),
+ wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue),
+ wds.map(self.to_dict, handler=wds.warn_and_continue),
+ )
+
+ def to_dict(self, sample):
+ return {
+ "image": sample[0],
+ "text_input": self.text_processor(sample[1]["caption"]),
+ }
+
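+
+if __name__ == "__main__":
+    # Usage sketch (illustrative; requires real webdataset shards, and the brace pattern
+    # below is a hypothetical placeholder): the pipeline resamples shards indefinitely,
+    # so iterate with islice rather than trying to exhaust it.
+    from itertools import islice
+
+    dataset = LaionDataset(
+        vis_processor=lambda img: img,   # identity stand-ins, for illustration only
+        text_processor=lambda txt: txt,
+        location="/path/to/laion/{00000..00010}.tar",
+    )
+    for sample in islice(dataset.inner_dataset, 2):
+        print(sample["image"].size, sample["text_input"][:40])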
diff --git a/models/MiniGPT4/minigpt4/models/Qformer.py b/models/MiniGPT4/minigpt4/models/Qformer.py
new file mode 100644
index 0000000..e71b123
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/models/Qformer.py
@@ -0,0 +1,1216 @@
+"""
+ * Copyright (c) 2023, salesforce.com, inc.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+ * By Junnan Li
+ * Based on huggingface code base
+ * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert
+"""
+
+import math
+import os
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Tuple, Dict, Any
+
+import torch
+from torch import Tensor, device, dtype, nn
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import CrossEntropyLoss
+import torch.nn.functional as F
+
+from transformers.activations import ACT2FN
+from transformers.file_utils import (
+ ModelOutput,
+)
+from transformers.modeling_outputs import (
+ BaseModelOutputWithPastAndCrossAttentions,
+ BaseModelOutputWithPoolingAndCrossAttentions,
+ CausalLMOutputWithCrossAttentions,
+ MaskedLMOutput,
+ MultipleChoiceModelOutput,
+ NextSentencePredictorOutput,
+ QuestionAnsweringModelOutput,
+ SequenceClassifierOutput,
+ TokenClassifierOutput,
+)
+from transformers.modeling_utils import (
+ PreTrainedModel,
+ apply_chunking_to_forward,
+ find_pruneable_heads_and_indices,
+ prune_linear_layer,
+)
+from transformers.utils import logging
+from transformers.models.bert.configuration_bert import BertConfig
+
+logger = logging.get_logger(__name__)
+
+
+class BertEmbeddings(nn.Module):
+ """Construct the embeddings from word and position embeddings."""
+
+ def __init__(self, config):
+ super().__init__()
+ self.word_embeddings = nn.Embedding(
+ config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
+ )
+ self.position_embeddings = nn.Embedding(
+ config.max_position_embeddings, config.hidden_size
+ )
+
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+ # any TensorFlow checkpoint file
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+ self.register_buffer(
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))
+ )
+ self.position_embedding_type = getattr(
+ config, "position_embedding_type", "absolute"
+ )
+
+ self.config = config
+
+ def forward(
+ self,
+ input_ids=None,
+ position_ids=None,
+ query_embeds=None,
+ past_key_values_length=0,
+ ):
+ if input_ids is not None:
+ seq_length = input_ids.size()[1]
+ else:
+ seq_length = 0
+
+ if position_ids is None:
+ position_ids = self.position_ids[
+ :, past_key_values_length : seq_length + past_key_values_length
+ ].clone()
+
+ if input_ids is not None:
+ embeddings = self.word_embeddings(input_ids)
+ if self.position_embedding_type == "absolute":
+ position_embeddings = self.position_embeddings(position_ids)
+ embeddings = embeddings + position_embeddings
+
+ if query_embeds is not None:
+ embeddings = torch.cat((query_embeds, embeddings), dim=1)
+ else:
+ embeddings = query_embeds
+
+ embeddings = self.LayerNorm(embeddings)
+ embeddings = self.dropout(embeddings)
+ return embeddings
+
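+
+# Shape sketch (illustrative, with a small hypothetical config): query embeddings are
+# prepended to the token embeddings, so the sequence grows from L to num_query + L.
+def _bert_embeddings_sketch():
+    cfg = BertConfig(vocab_size=100, hidden_size=32, max_position_embeddings=64)
+    emb = BertEmbeddings(cfg)
+    input_ids = torch.randint(0, 100, (2, 5))  # batch of 2, 5 text tokens each
+    query_embeds = torch.zeros(2, 8, 32)       # 8 learnable query tokens per sample
+    out = emb(input_ids=input_ids, query_embeds=query_embeds)
+    return out.shape  # torch.Size([2, 13, 32])
+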
+
+class BertSelfAttention(nn.Module):
+ def __init__(self, config, is_cross_attention):
+ super().__init__()
+ self.config = config
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
+ config, "embedding_size"
+ ):
+ raise ValueError(
+ "The hidden size (%d) is not a multiple of the number of attention "
+ "heads (%d)" % (config.hidden_size, config.num_attention_heads)
+ )
+
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
+ if is_cross_attention:
+ self.key = nn.Linear(config.encoder_width, self.all_head_size)
+ self.value = nn.Linear(config.encoder_width, self.all_head_size)
+ else:
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+ self.position_embedding_type = getattr(
+ config, "position_embedding_type", "absolute"
+ )
+ if (
+ self.position_embedding_type == "relative_key"
+ or self.position_embedding_type == "relative_key_query"
+ ):
+ self.max_position_embeddings = config.max_position_embeddings
+ self.distance_embedding = nn.Embedding(
+ 2 * config.max_position_embeddings - 1, self.attention_head_size
+ )
+ self.save_attention = False
+
+ def save_attn_gradients(self, attn_gradients):
+ self.attn_gradients = attn_gradients
+
+ def get_attn_gradients(self):
+ return self.attn_gradients
+
+ def save_attention_map(self, attention_map):
+ self.attention_map = attention_map
+
+ def get_attention_map(self):
+ return self.attention_map
+
+ def transpose_for_scores(self, x):
+ new_x_shape = x.size()[:-1] + (
+ self.num_attention_heads,
+ self.attention_head_size,
+ )
+ x = x.view(*new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_value=None,
+ output_attentions=False,
+ ):
+
+ # If this is instantiated as a cross-attention module, the keys
+ # and values come from an encoder; the attention mask needs to be
+ # such that the encoder's padding tokens are not attended to.
+ is_cross_attention = encoder_hidden_states is not None
+
+ if is_cross_attention:
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
+ attention_mask = encoder_attention_mask
+ elif past_key_value is not None:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
+ value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
+ else:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+ mixed_query_layer = self.query(hidden_states)
+
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+
+ past_key_value = (key_layer, value_layer)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+ if (
+ self.position_embedding_type == "relative_key"
+ or self.position_embedding_type == "relative_key_query"
+ ):
+ seq_length = hidden_states.size()[1]
+ position_ids_l = torch.arange(
+ seq_length, dtype=torch.long, device=hidden_states.device
+ ).view(-1, 1)
+ position_ids_r = torch.arange(
+ seq_length, dtype=torch.long, device=hidden_states.device
+ ).view(1, -1)
+ distance = position_ids_l - position_ids_r
+ positional_embedding = self.distance_embedding(
+ distance + self.max_position_embeddings - 1
+ )
+ positional_embedding = positional_embedding.to(
+ dtype=query_layer.dtype
+ ) # fp16 compatibility
+
+ if self.position_embedding_type == "relative_key":
+ relative_position_scores = torch.einsum(
+ "bhld,lrd->bhlr", query_layer, positional_embedding
+ )
+ attention_scores = attention_scores + relative_position_scores
+ elif self.position_embedding_type == "relative_key_query":
+ relative_position_scores_query = torch.einsum(
+ "bhld,lrd->bhlr", query_layer, positional_embedding
+ )
+ relative_position_scores_key = torch.einsum(
+ "bhrd,lrd->bhlr", key_layer, positional_embedding
+ )
+ attention_scores = (
+ attention_scores
+ + relative_position_scores_query
+ + relative_position_scores_key
+ )
+
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+ if attention_mask is not None:
+ # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+ attention_scores = attention_scores + attention_mask
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+ if is_cross_attention and self.save_attention:
+ self.save_attention_map(attention_probs)
+ attention_probs.register_hook(self.save_attn_gradients)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs_dropped = self.dropout(attention_probs)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs_dropped = attention_probs_dropped * head_mask
+
+ context_layer = torch.matmul(attention_probs_dropped, value_layer)
+
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(*new_context_layer_shape)
+
+ outputs = (
+ (context_layer, attention_probs) if output_attentions else (context_layer,)
+ )
+
+ outputs = outputs + (past_key_value,)
+ return outputs
+
+
+class BertSelfOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class BertAttention(nn.Module):
+ def __init__(self, config, is_cross_attention=False):
+ super().__init__()
+ self.self = BertSelfAttention(config, is_cross_attention)
+ self.output = BertSelfOutput(config)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads,
+ self.self.num_attention_heads,
+ self.self.attention_head_size,
+ self.pruned_heads,
+ )
+
+ # Prune linear layers
+ self.self.query = prune_linear_layer(self.self.query, index)
+ self.self.key = prune_linear_layer(self.self.key, index)
+ self.self.value = prune_linear_layer(self.self.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+ self.self.all_head_size = (
+ self.self.attention_head_size * self.self.num_attention_heads
+ )
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_value=None,
+ output_attentions=False,
+ ):
+ self_outputs = self.self(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_value,
+ output_attentions,
+ )
+ attention_output = self.output(self_outputs[0], hidden_states)
+
+ outputs = (attention_output,) + self_outputs[
+ 1:
+ ] # add attentions if we output them
+ return outputs
+
+
+class BertIntermediate(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+
+class BertOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class BertLayer(nn.Module):
+ def __init__(self, config, layer_num):
+ super().__init__()
+ self.config = config
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
+ self.seq_len_dim = 1
+ self.attention = BertAttention(config)
+ self.layer_num = layer_num
+ if (
+ self.config.add_cross_attention
+ and layer_num % self.config.cross_attention_freq == 0
+ ):
+ self.crossattention = BertAttention(
+ config, is_cross_attention=self.config.add_cross_attention
+ )
+ self.has_cross_attention = True
+ else:
+ self.has_cross_attention = False
+ self.intermediate = BertIntermediate(config)
+ self.output = BertOutput(config)
+
+ self.intermediate_query = BertIntermediate(config)
+ self.output_query = BertOutput(config)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_value=None,
+ output_attentions=False,
+ query_length=0,
+ ):
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+ self_attn_past_key_value = (
+ past_key_value[:2] if past_key_value is not None else None
+ )
+ self_attention_outputs = self.attention(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ output_attentions=output_attentions,
+ past_key_value=self_attn_past_key_value,
+ )
+ attention_output = self_attention_outputs[0]
+ outputs = self_attention_outputs[1:-1]
+
+ present_key_value = self_attention_outputs[-1]
+
+ if query_length > 0:
+ query_attention_output = attention_output[:, :query_length, :]
+
+ if self.has_cross_attention:
+ assert (
+ encoder_hidden_states is not None
+ ), "encoder_hidden_states must be given for cross-attention layers"
+ cross_attention_outputs = self.crossattention(
+ query_attention_output,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ output_attentions=output_attentions,
+ )
+ query_attention_output = cross_attention_outputs[0]
+ outputs = (
+ outputs + cross_attention_outputs[1:-1]
+ ) # add cross attentions if we output attention weights
+
+ layer_output = apply_chunking_to_forward(
+ self.feed_forward_chunk_query,
+ self.chunk_size_feed_forward,
+ self.seq_len_dim,
+ query_attention_output,
+ )
+ if attention_output.shape[1] > query_length:
+ layer_output_text = apply_chunking_to_forward(
+ self.feed_forward_chunk,
+ self.chunk_size_feed_forward,
+ self.seq_len_dim,
+ attention_output[:, query_length:, :],
+ )
+ layer_output = torch.cat([layer_output, layer_output_text], dim=1)
+ else:
+ layer_output = apply_chunking_to_forward(
+ self.feed_forward_chunk,
+ self.chunk_size_feed_forward,
+ self.seq_len_dim,
+ attention_output,
+ )
+ outputs = (layer_output,) + outputs
+
+ outputs = outputs + (present_key_value,)
+
+ return outputs
+
+ def feed_forward_chunk(self, attention_output):
+ intermediate_output = self.intermediate(attention_output)
+ layer_output = self.output(intermediate_output, attention_output)
+ return layer_output
+
+ def feed_forward_chunk_query(self, attention_output):
+ intermediate_output = self.intermediate_query(attention_output)
+ layer_output = self.output_query(intermediate_output, attention_output)
+ return layer_output
+
+
+class BertEncoder(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.layer = nn.ModuleList(
+ [BertLayer(config, i) for i in range(config.num_hidden_layers)]
+ )
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_values=None,
+ use_cache=None,
+ output_attentions=False,
+ output_hidden_states=False,
+ return_dict=True,
+ query_length=0,
+ ):
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+ all_cross_attentions = (
+ () if output_attentions and self.config.add_cross_attention else None
+ )
+
+ next_decoder_cache = () if use_cache else None
+
+ for i in range(self.config.num_hidden_layers):
+ layer_module = self.layer[i]
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+ past_key_value = past_key_values[i] if past_key_values is not None else None
+
+ if getattr(self.config, "gradient_checkpointing", False) and self.training:
+
+ if use_cache:
+ logger.warn(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(
+ *inputs, past_key_value, output_attentions, query_length
+ )
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(layer_module),
+ hidden_states,
+ attention_mask,
+ layer_head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ )
+ else:
+ layer_outputs = layer_module(
+ hidden_states,
+ attention_mask,
+ layer_head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_value,
+ output_attentions,
+ query_length,
+ )
+
+ hidden_states = layer_outputs[0]
+ if use_cache:
+ next_decoder_cache += (layer_outputs[-1],)
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+ all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(
+ v
+ for v in [
+ hidden_states,
+ next_decoder_cache,
+ all_hidden_states,
+ all_self_attentions,
+ all_cross_attentions,
+ ]
+ if v is not None
+ )
+ return BaseModelOutputWithPastAndCrossAttentions(
+ last_hidden_state=hidden_states,
+ past_key_values=next_decoder_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ cross_attentions=all_cross_attentions,
+ )
+
+
+class BertPooler(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.activation = nn.Tanh()
+
+ def forward(self, hidden_states):
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ first_token_tensor = hidden_states[:, 0]
+ pooled_output = self.dense(first_token_tensor)
+ pooled_output = self.activation(pooled_output)
+ return pooled_output
+
+
+class BertPredictionHeadTransform(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ if isinstance(config.hidden_act, str):
+ self.transform_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.transform_act_fn = config.hidden_act
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.transform_act_fn(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states)
+ return hidden_states
+
+
+class BertLMPredictionHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.transform = BertPredictionHeadTransform(config)
+
+ # The output weights are the same as the input embeddings, but there is
+ # an output-only bias for each token.
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+ self.decoder.bias = self.bias
+
+ def forward(self, hidden_states):
+ hidden_states = self.transform(hidden_states)
+ hidden_states = self.decoder(hidden_states)
+ return hidden_states
+
+
+class BertOnlyMLMHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.predictions = BertLMPredictionHead(config)
+
+ def forward(self, sequence_output):
+ prediction_scores = self.predictions(sequence_output)
+ return prediction_scores
+
+
+class BertPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = BertConfig
+ base_model_prefix = "bert"
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, (nn.Linear, nn.Embedding)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+
+class BertModel(BertPreTrainedModel):
+ """
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+ cross-attention is added between the self-attention layers, following the architecture described in `Attention is
+    all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To be used as a decoder, the model needs to be
+    initialized with the :obj:`is_decoder` argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
+ input to the forward pass.
+ """
+
+ def __init__(self, config, add_pooling_layer=False):
+ super().__init__(config)
+ self.config = config
+
+ self.embeddings = BertEmbeddings(config)
+
+ self.encoder = BertEncoder(config)
+
+ self.pooler = BertPooler(config) if add_pooling_layer else None
+
+ self.init_weights()
+
+ def get_input_embeddings(self):
+ return self.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.embeddings.word_embeddings = value
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ def get_extended_attention_mask(
+ self,
+ attention_mask: Tensor,
+ input_shape: Tuple[int],
+ device: device,
+ is_decoder: bool,
+ has_query: bool = False,
+ ) -> Tensor:
+ """
+ Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
+
+ Arguments:
+ attention_mask (:obj:`torch.Tensor`):
+ Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
+ input_shape (:obj:`Tuple[int]`):
+ The shape of the input to the model.
+ device: (:obj:`torch.device`):
+ The device of the input to the model.
+
+ Returns:
+            :obj:`torch.Tensor` The extended attention mask, with the same dtype as :obj:`attention_mask.dtype`.
+ """
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+ # ourselves in which case we just need to make it broadcastable to all heads.
+ if attention_mask.dim() == 3:
+ extended_attention_mask = attention_mask[:, None, :, :]
+ elif attention_mask.dim() == 2:
+ # Provided a padding mask of dimensions [batch_size, seq_length]
+ # - if the model is a decoder, apply a causal mask in addition to the padding mask
+ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+ if is_decoder:
+ batch_size, seq_length = input_shape
+
+ seq_ids = torch.arange(seq_length, device=device)
+ causal_mask = (
+ seq_ids[None, None, :].repeat(batch_size, seq_length, 1)
+ <= seq_ids[None, :, None]
+ )
+
+ # add a prefix ones mask to the causal mask
+ # causal and attention masks must have same type with pytorch version < 1.3
+ causal_mask = causal_mask.to(attention_mask.dtype)
+
+ if causal_mask.shape[1] < attention_mask.shape[1]:
+ prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
+ if has_query: # UniLM style attention mask
+ causal_mask = torch.cat(
+ [
+ torch.zeros(
+ (batch_size, prefix_seq_len, seq_length),
+ device=device,
+ dtype=causal_mask.dtype,
+ ),
+ causal_mask,
+ ],
+ axis=1,
+ )
+ causal_mask = torch.cat(
+ [
+ torch.ones(
+ (batch_size, causal_mask.shape[1], prefix_seq_len),
+ device=device,
+ dtype=causal_mask.dtype,
+ ),
+ causal_mask,
+ ],
+ axis=-1,
+ )
+ extended_attention_mask = (
+ causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
+ )
+ else:
+ extended_attention_mask = attention_mask[:, None, None, :]
+ else:
+ raise ValueError(
+ "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
+ input_shape, attention_mask.shape
+ )
+ )
+
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+ # masked positions, this operation will create a tensor which is 0.0 for
+ # positions we want to attend and -10000.0 for masked positions.
+ # Since we are adding it to the raw scores before the softmax, this is
+ # effectively the same as removing these entirely.
+ extended_attention_mask = extended_attention_mask.to(
+ dtype=self.dtype
+ ) # fp16 compatibility
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+ return extended_attention_mask
+
+ def forward(
+ self,
+ input_ids=None,
+ attention_mask=None,
+ position_ids=None,
+ head_mask=None,
+ query_embeds=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_values=None,
+ use_cache=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ is_decoder=False,
+ ):
+ r"""
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
+ use_cache (:obj:`bool`, `optional`):
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+ decoding (see :obj:`past_key_values`).
+ """
+ output_attentions = (
+ output_attentions
+ if output_attentions is not None
+ else self.config.output_attentions
+ )
+ output_hidden_states = (
+ output_hidden_states
+ if output_hidden_states is not None
+ else self.config.output_hidden_states
+ )
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
+ )
+
+ # use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ if input_ids is None:
+ assert (
+ query_embeds is not None
+ ), "You have to specify query_embeds when input_ids is None"
+
+ # past_key_values_length
+ past_key_values_length = (
+ past_key_values[0][0].shape[2] - self.config.query_length
+ if past_key_values is not None
+ else 0
+ )
+
+ query_length = query_embeds.shape[1] if query_embeds is not None else 0
+
+ embedding_output = self.embeddings(
+ input_ids=input_ids,
+ position_ids=position_ids,
+ query_embeds=query_embeds,
+ past_key_values_length=past_key_values_length,
+ )
+
+ input_shape = embedding_output.size()[:-1]
+ batch_size, seq_length = input_shape
+ device = embedding_output.device
+
+ if attention_mask is None:
+ attention_mask = torch.ones(
+ ((batch_size, seq_length + past_key_values_length)), device=device
+ )
+
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+ # ourselves in which case we just need to make it broadcastable to all heads.
+ if is_decoder:
+ extended_attention_mask = self.get_extended_attention_mask(
+ attention_mask,
+ input_ids.shape,
+ device,
+ is_decoder,
+ has_query=(query_embeds is not None),
+ )
+ else:
+ extended_attention_mask = self.get_extended_attention_mask(
+ attention_mask, input_shape, device, is_decoder
+ )
+
+ # If a 2D or 3D attention mask is provided for the cross-attention
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+ if encoder_hidden_states is not None:
+ if type(encoder_hidden_states) == list:
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[
+ 0
+ ].size()
+ else:
+ (
+ encoder_batch_size,
+ encoder_sequence_length,
+ _,
+ ) = encoder_hidden_states.size()
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+
+ if type(encoder_attention_mask) == list:
+ encoder_extended_attention_mask = [
+ self.invert_attention_mask(mask) for mask in encoder_attention_mask
+ ]
+ elif encoder_attention_mask is None:
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+ encoder_extended_attention_mask = self.invert_attention_mask(
+ encoder_attention_mask
+ )
+ else:
+ encoder_extended_attention_mask = self.invert_attention_mask(
+ encoder_attention_mask
+ )
+ else:
+ encoder_extended_attention_mask = None
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ attention_mask=extended_attention_mask,
+ head_mask=head_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_extended_attention_mask,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ query_length=query_length,
+ )
+ sequence_output = encoder_outputs[0]
+ pooled_output = (
+ self.pooler(sequence_output) if self.pooler is not None else None
+ )
+
+ if not return_dict:
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPoolingAndCrossAttentions(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ past_key_values=encoder_outputs.past_key_values,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ cross_attentions=encoder_outputs.cross_attentions,
+ )
+
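+
+# Sketch of the additive mask produced by get_extended_attention_mask (illustrative): a
+# [batch, seq] padding mask becomes a broadcastable [batch, 1, 1, seq] tensor whose masked
+# positions hold -10000.0, so softmax effectively ignores them.
+def _extended_attention_mask_sketch():
+    attention_mask = torch.tensor([[1.0, 1.0, 1.0, 0.0]])
+    extended = (1.0 - attention_mask[:, None, None, :]) * -10000.0
+    return extended  # tensor([[[[-0., -0., -0., -10000.]]]])
+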
+
+class BertLMHeadModel(BertPreTrainedModel):
+
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
+ _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
+
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.bert = BertModel(config, add_pooling_layer=False)
+ self.cls = BertOnlyMLMHead(config)
+
+ self.init_weights()
+
+ def get_output_embeddings(self):
+ return self.cls.predictions.decoder
+
+ def set_output_embeddings(self, new_embeddings):
+ self.cls.predictions.decoder = new_embeddings
+
+ def forward(
+ self,
+ input_ids=None,
+ attention_mask=None,
+ position_ids=None,
+ head_mask=None,
+ query_embeds=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ labels=None,
+ past_key_values=None,
+ use_cache=True,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ return_logits=False,
+ is_decoder=True,
+ reduction="mean",
+ ):
+ r"""
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are
+            ignored (masked); the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
+ use_cache (:obj:`bool`, `optional`):
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+ decoding (see :obj:`past_key_values`).
+ Returns:
+ Example::
+ >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
+ >>> import torch
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+ >>> config = BertConfig.from_pretrained("bert-base-cased")
+ >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> prediction_logits = outputs.logits
+ """
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
+ )
+ if labels is not None:
+ use_cache = False
+ if past_key_values is not None:
+ query_embeds = None
+
+ outputs = self.bert(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ query_embeds=query_embeds,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ is_decoder=is_decoder,
+ )
+
+ sequence_output = outputs[0]
+ if query_embeds is not None:
+ sequence_output = outputs[0][:, query_embeds.shape[1] :, :]
+
+ prediction_scores = self.cls(sequence_output)
+
+ if return_logits:
+ return prediction_scores[:, :-1, :].contiguous()
+
+ lm_loss = None
+ if labels is not None:
+ # we are doing next-token prediction; shift prediction scores and input ids by one
+ shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
+ labels = labels[:, 1:].contiguous()
+ loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
+ lm_loss = loss_fct(
+ shifted_prediction_scores.view(-1, self.config.vocab_size),
+ labels.view(-1),
+ )
+ if reduction == "none":
+ lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)
+
+ if not return_dict:
+ output = (prediction_scores,) + outputs[2:]
+ return ((lm_loss,) + output) if lm_loss is not None else output
+
+ return CausalLMOutputWithCrossAttentions(
+ loss=lm_loss,
+ logits=prediction_scores,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ cross_attentions=outputs.cross_attentions,
+ )
+
+ def prepare_inputs_for_generation(
+ self, input_ids, query_embeds, past=None, attention_mask=None, **model_kwargs
+ ):
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+ if attention_mask is None:
+ attention_mask = input_ids.new_ones(input_ids.shape)
+ query_mask = input_ids.new_ones(query_embeds.shape[:-1])
+ attention_mask = torch.cat([query_mask, attention_mask], dim=-1)
+
+ # cut decoder_input_ids if past is used
+ if past is not None:
+ input_ids = input_ids[:, -1:]
+
+ return {
+ "input_ids": input_ids,
+ "query_embeds": query_embeds,
+ "attention_mask": attention_mask,
+ "past_key_values": past,
+ "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
+ "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
+ "is_decoder": True,
+ }
+
+ def _reorder_cache(self, past, beam_idx):
+ reordered_past = ()
+ for layer_past in past:
+ reordered_past += (
+ tuple(
+ past_state.index_select(0, beam_idx) for past_state in layer_past
+ ),
+ )
+ return reordered_past
+
+
+class BertForMaskedLM(BertPreTrainedModel):
+
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
+ _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
+
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.bert = BertModel(config, add_pooling_layer=False)
+ self.cls = BertOnlyMLMHead(config)
+
+ self.init_weights()
+
+ def get_output_embeddings(self):
+ return self.cls.predictions.decoder
+
+ def set_output_embeddings(self, new_embeddings):
+ self.cls.predictions.decoder = new_embeddings
+
+ def forward(
+ self,
+ input_ids=None,
+ attention_mask=None,
+ position_ids=None,
+ head_mask=None,
+ query_embeds=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ labels=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ return_logits=False,
+ is_decoder=False,
+ ):
+ r"""
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
+ config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
+ (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+ """
+
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
+ )
+
+ outputs = self.bert(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ query_embeds=query_embeds,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ is_decoder=is_decoder,
+ )
+
+        sequence_output = outputs[0]
+        if query_embeds is not None:
+            # drop the query positions so the MLM head only scores text tokens
+            sequence_output = outputs[0][:, query_embeds.shape[1] :, :]
+        prediction_scores = self.cls(sequence_output)
+
+ if return_logits:
+ return prediction_scores
+
+ masked_lm_loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss() # -100 index = padding token
+ masked_lm_loss = loss_fct(
+ prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)
+ )
+
+ if not return_dict:
+ output = (prediction_scores,) + outputs[2:]
+ return (
+ ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+ )
+
+ return MaskedLMOutput(
+ loss=masked_lm_loss,
+ logits=prediction_scores,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
diff --git a/models/MiniGPT4/minigpt4/models/__init__.py b/models/MiniGPT4/minigpt4/models/__init__.py
new file mode 100644
index 0000000..54acd24
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/models/__init__.py
@@ -0,0 +1,200 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import logging
+import torch
+from omegaconf import OmegaConf
+
+from minigpt4.common.registry import registry
+from minigpt4.models.base_model import BaseModel
+from minigpt4.models.blip2 import Blip2Base
+from minigpt4.models.mini_gpt4 import MiniGPT4
+from minigpt4.processors.base_processor import BaseProcessor
+
+
+__all__ = [
+ "load_model",
+ "BaseModel",
+ "Blip2Base",
+ "MiniGPT4",
+]
+
+
+def load_model(name, model_type, is_eval=False, device="cpu", checkpoint=None):
+ """
+ Load supported models.
+
+ To list all available models and types in registry:
+ >>> from minigpt4.models import model_zoo
+ >>> print(model_zoo)
+
+ Args:
+ name (str): name of the model.
+ model_type (str): type of the model.
+ is_eval (bool): whether the model is in eval mode. Default: False.
+ device (str): device to use. Default: "cpu".
+        checkpoint (str): path or URL to the checkpoint. Default: None.
+            Note that the checkpoint is expected to have the same state_dict keys as the model.
+
+ Returns:
+ model (torch.nn.Module): model.
+ """
+
+ model = registry.get_model_class(name).from_pretrained(model_type=model_type)
+
+ if checkpoint is not None:
+ model.load_checkpoint(checkpoint)
+
+ if is_eval:
+ model.eval()
+
+ if device == "cpu":
+ model = model.float()
+
+ return model.to(device)
+
+
+def load_preprocess(config):
+ """
+ Load preprocessor configs and construct preprocessors.
+
+ If no preprocessor is specified, return BaseProcessor, which does not do any preprocessing.
+
+ Args:
+ config (dict): preprocessor configs.
+
+ Returns:
+ vis_processors (dict): preprocessors for visual inputs.
+ txt_processors (dict): preprocessors for text inputs.
+
+ Key is "train" or "eval" for processors used in training and evaluation respectively.
+ """
+
+ def _build_proc_from_cfg(cfg):
+ return (
+ registry.get_processor_class(cfg.name).from_config(cfg)
+ if cfg is not None
+ else BaseProcessor()
+ )
+
+ vis_processors = dict()
+ txt_processors = dict()
+
+ vis_proc_cfg = config.get("vis_processor")
+ txt_proc_cfg = config.get("text_processor")
+
+ if vis_proc_cfg is not None:
+ vis_train_cfg = vis_proc_cfg.get("train")
+ vis_eval_cfg = vis_proc_cfg.get("eval")
+ else:
+ vis_train_cfg = None
+ vis_eval_cfg = None
+
+ vis_processors["train"] = _build_proc_from_cfg(vis_train_cfg)
+ vis_processors["eval"] = _build_proc_from_cfg(vis_eval_cfg)
+
+ if txt_proc_cfg is not None:
+ txt_train_cfg = txt_proc_cfg.get("train")
+ txt_eval_cfg = txt_proc_cfg.get("eval")
+ else:
+ txt_train_cfg = None
+ txt_eval_cfg = None
+
+ txt_processors["train"] = _build_proc_from_cfg(txt_train_cfg)
+ txt_processors["eval"] = _build_proc_from_cfg(txt_eval_cfg)
+
+ return vis_processors, txt_processors
+
+
+def load_model_and_preprocess(name, model_type, is_eval=False, device="cpu"):
+ """
+ Load model and its related preprocessors.
+
+ List all available models and types in registry:
+ >>> from minigpt4.models import model_zoo
+ >>> print(model_zoo)
+
+ Args:
+ name (str): name of the model.
+ model_type (str): type of the model.
+ is_eval (bool): whether the model is in eval mode. Default: False.
+ device (str): device to use. Default: "cpu".
+
+ Returns:
+ model (torch.nn.Module): model.
+ vis_processors (dict): preprocessors for visual inputs.
+ txt_processors (dict): preprocessors for text inputs.
+ """
+ model_cls = registry.get_model_class(name)
+
+ # load model
+ model = model_cls.from_pretrained(model_type=model_type)
+
+ if is_eval:
+ model.eval()
+
+ # load preprocess
+ cfg = OmegaConf.load(model_cls.default_config_path(model_type))
+ if cfg is not None:
+ preprocess_cfg = cfg.preprocess
+
+ vis_processors, txt_processors = load_preprocess(preprocess_cfg)
+ else:
+ vis_processors, txt_processors = None, None
+ logging.info(
+ f"""No default preprocess for model {name} ({model_type}).
+ This can happen if the model is not finetuned on downstream datasets,
+ or it is not intended for direct use without finetuning.
+ """
+ )
+
+ if device == "cpu" or device == torch.device("cpu"):
+ model = model.float()
+
+ return model.to(device), vis_processors, txt_processors
+
+
+class ModelZoo:
+ """
+ A utility class to create string representation of available model architectures and types.
+
+ >>> from minigpt4.models import model_zoo
+ >>> # list all available models
+ >>> print(model_zoo)
+ >>> # show total number of models
+ >>> print(len(model_zoo))
+ """
+
+ def __init__(self) -> None:
+ self.model_zoo = {
+ k: list(v.PRETRAINED_MODEL_CONFIG_DICT.keys())
+ for k, v in registry.mapping["model_name_mapping"].items()
+ }
+
+ def __str__(self) -> str:
+ return (
+ "=" * 50
+ + "\n"
+ + f"{'Architectures':<30} {'Types'}\n"
+ + "=" * 50
+ + "\n"
+ + "\n".join(
+ [
+ f"{name:<30} {', '.join(types)}"
+ for name, types in self.model_zoo.items()
+ ]
+ )
+ )
+
+ def __iter__(self):
+ return iter(self.model_zoo.items())
+
+ def __len__(self):
+ return sum([len(v) for v in self.model_zoo.values()])
+
+
+model_zoo = ModelZoo()
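For reference, `load_model_and_preprocess` above is the usual entry point when this vendored MiniGPT-4 package is used on its own. The sketch below is a minimal, hedged example assuming the `mini_gpt4` architecture with its `pretrain_vicuna` config (both registered later in this diff) and assuming the weight paths inside `configs/models/minigpt4.yaml` point to valid local files:
```python
import torch
from minigpt4.models import load_model_and_preprocess, model_zoo

# list registered architectures and their config types
print(model_zoo)

# a minimal sketch, assuming the paths in configs/models/minigpt4.yaml are valid locally
device = "cuda" if torch.cuda.is_available() else "cpu"
model, vis_processors, txt_processors = load_model_and_preprocess(
    name="mini_gpt4", model_type="pretrain_vicuna", is_eval=True, device=device
)
# vis_processors["eval"] turns a PIL image into the tensor expected by model.encode_img
```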
diff --git a/models/MiniGPT4/minigpt4/models/__pycache__/Qformer.cpython-310.pyc b/models/MiniGPT4/minigpt4/models/__pycache__/Qformer.cpython-310.pyc
new file mode 100644
index 0000000..b22113b
Binary files /dev/null and b/models/MiniGPT4/minigpt4/models/__pycache__/Qformer.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/models/__pycache__/__init__.cpython-310.pyc b/models/MiniGPT4/minigpt4/models/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..d1b8f92
Binary files /dev/null and b/models/MiniGPT4/minigpt4/models/__pycache__/__init__.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/models/__pycache__/base_model.cpython-310.pyc b/models/MiniGPT4/minigpt4/models/__pycache__/base_model.cpython-310.pyc
new file mode 100644
index 0000000..55d543d
Binary files /dev/null and b/models/MiniGPT4/minigpt4/models/__pycache__/base_model.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/models/__pycache__/blip2.cpython-310.pyc b/models/MiniGPT4/minigpt4/models/__pycache__/blip2.cpython-310.pyc
new file mode 100644
index 0000000..0d53e37
Binary files /dev/null and b/models/MiniGPT4/minigpt4/models/__pycache__/blip2.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/models/__pycache__/eva_vit.cpython-310.pyc b/models/MiniGPT4/minigpt4/models/__pycache__/eva_vit.cpython-310.pyc
new file mode 100644
index 0000000..5a79b0f
Binary files /dev/null and b/models/MiniGPT4/minigpt4/models/__pycache__/eva_vit.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/models/__pycache__/mini_gpt4.cpython-310.pyc b/models/MiniGPT4/minigpt4/models/__pycache__/mini_gpt4.cpython-310.pyc
new file mode 100644
index 0000000..46da01c
Binary files /dev/null and b/models/MiniGPT4/minigpt4/models/__pycache__/mini_gpt4.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/models/__pycache__/modeling_llama.cpython-310.pyc b/models/MiniGPT4/minigpt4/models/__pycache__/modeling_llama.cpython-310.pyc
new file mode 100644
index 0000000..b58f431
Binary files /dev/null and b/models/MiniGPT4/minigpt4/models/__pycache__/modeling_llama.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/models/base_model.py b/models/MiniGPT4/minigpt4/models/base_model.py
new file mode 100644
index 0000000..1cd2226
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/models/base_model.py
@@ -0,0 +1,247 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import logging
+import os
+
+import numpy as np
+import torch
+import torch.nn as nn
+from minigpt4.common.dist_utils import download_cached_file, is_dist_avail_and_initialized
+from minigpt4.common.utils import get_abs_path, is_url
+from omegaconf import OmegaConf
+
+
+class BaseModel(nn.Module):
+ """Base class for models."""
+
+ def __init__(self):
+ super().__init__()
+
+ @property
+ def device(self):
+ return list(self.parameters())[0].device
+
+ def load_checkpoint(self, url_or_filename):
+ """
+ Load from a finetuned checkpoint.
+
+ This should expect no mismatch in the model keys and the checkpoint keys.
+ """
+
+ if is_url(url_or_filename):
+ cached_file = download_cached_file(
+ url_or_filename, check_hash=False, progress=True
+ )
+ checkpoint = torch.load(cached_file, map_location="cpu")
+ elif os.path.isfile(url_or_filename):
+ checkpoint = torch.load(url_or_filename, map_location="cpu")
+ else:
+ raise RuntimeError("checkpoint url or path is invalid")
+
+ if "model" in checkpoint.keys():
+ state_dict = checkpoint["model"]
+ else:
+ state_dict = checkpoint
+
+ msg = self.load_state_dict(state_dict, strict=False)
+
+ logging.info("Missing keys {}".format(msg.missing_keys))
+ logging.info("load checkpoint from %s" % url_or_filename)
+
+ return msg
+
+ @classmethod
+ def from_pretrained(cls, model_type):
+ """
+ Build a pretrained model from default configuration file, specified by model_type.
+
+ Args:
+ - model_type (str): model type, specifying architecture and checkpoints.
+
+ Returns:
+ - model (nn.Module): pretrained or finetuned model, depending on the configuration.
+ """
+ model_cfg = OmegaConf.load(cls.default_config_path(model_type)).model
+ model = cls.from_config(model_cfg)
+
+ return model
+
+ @classmethod
+ def default_config_path(cls, model_type):
+ assert (
+ model_type in cls.PRETRAINED_MODEL_CONFIG_DICT
+ ), "Unknown model type {}".format(model_type)
+ return get_abs_path(cls.PRETRAINED_MODEL_CONFIG_DICT[model_type])
+
+ def load_checkpoint_from_config(self, cfg, **kwargs):
+ """
+ Load checkpoint as specified in the config file.
+
+ If load_finetuned is True, load the finetuned model; otherwise, load the pretrained model.
+ When loading the pretrained model, each task-specific architecture may define their
+ own load_from_pretrained() method.
+ """
+ load_finetuned = cfg.get("load_finetuned", True)
+ if load_finetuned:
+ finetune_path = cfg.get("finetuned", None)
+ assert (
+ finetune_path is not None
+ ), "Found load_finetuned is True, but finetune_path is None."
+ self.load_checkpoint(url_or_filename=finetune_path)
+ else:
+ # load pre-trained weights
+ pretrain_path = cfg.get("pretrained", None)
+ assert "Found load_finetuned is False, but pretrain_path is None."
+ self.load_from_pretrained(url_or_filename=pretrain_path, **kwargs)
+
+ def before_evaluation(self, **kwargs):
+ pass
+
+ def show_n_params(self, return_str=True):
+ tot = 0
+ for p in self.parameters():
+ w = 1
+ for x in p.shape:
+ w *= x
+ tot += w
+ if return_str:
+ if tot >= 1e6:
+ return "{:.1f}M".format(tot / 1e6)
+ else:
+ return "{:.1f}K".format(tot / 1e3)
+ else:
+ return tot
+
+
+class BaseEncoder(nn.Module):
+ """
+ Base class for primitive encoders, such as ViT, TimeSformer, etc.
+ """
+
+ def __init__(self):
+ super().__init__()
+
+ def forward_features(self, samples, **kwargs):
+ raise NotImplementedError
+
+ @property
+ def device(self):
+ return list(self.parameters())[0].device
+
+
+class SharedQueueMixin:
+ @torch.no_grad()
+ def _dequeue_and_enqueue(self, image_feat, text_feat, idxs=None):
+ # gather keys before updating queue
+ image_feats = concat_all_gather(image_feat)
+ text_feats = concat_all_gather(text_feat)
+
+ batch_size = image_feats.shape[0]
+
+ ptr = int(self.queue_ptr)
+ assert self.queue_size % batch_size == 0 # for simplicity
+
+ # replace the keys at ptr (dequeue and enqueue)
+ self.image_queue[:, ptr : ptr + batch_size] = image_feats.T
+ self.text_queue[:, ptr : ptr + batch_size] = text_feats.T
+
+ if idxs is not None:
+ idxs = concat_all_gather(idxs)
+ self.idx_queue[:, ptr : ptr + batch_size] = idxs.T
+
+ ptr = (ptr + batch_size) % self.queue_size # move pointer
+ self.queue_ptr[0] = ptr
+
+
+class MomentumDistilationMixin:
+ @torch.no_grad()
+ def copy_params(self):
+ for model_pair in self.model_pairs:
+ for param, param_m in zip(
+ model_pair[0].parameters(), model_pair[1].parameters()
+ ):
+ param_m.data.copy_(param.data) # initialize
+ param_m.requires_grad = False # not update by gradient
+
+ @torch.no_grad()
+ def _momentum_update(self):
+ for model_pair in self.model_pairs:
+ for param, param_m in zip(
+ model_pair[0].parameters(), model_pair[1].parameters()
+ ):
+ param_m.data = param_m.data * self.momentum + param.data * (
+ 1.0 - self.momentum
+ )
+
+
+class GatherLayer(torch.autograd.Function):
+ """
+ Gather tensors from all workers with support for backward propagation:
+ This implementation does not cut the gradients as torch.distributed.all_gather does.
+ """
+
+ @staticmethod
+ def forward(ctx, x):
+ output = [
+ torch.zeros_like(x) for _ in range(torch.distributed.get_world_size())
+ ]
+ torch.distributed.all_gather(output, x)
+ return tuple(output)
+
+ @staticmethod
+ def backward(ctx, *grads):
+ all_gradients = torch.stack(grads)
+ torch.distributed.all_reduce(all_gradients)
+ return all_gradients[torch.distributed.get_rank()]
+
+
+def all_gather_with_grad(tensors):
+ """
+ Performs all_gather operation on the provided tensors.
+ Graph remains connected for backward grad computation.
+ """
+ # Queue the gathered tensors
+ world_size = torch.distributed.get_world_size()
+ # There is no need for reduction in the single-proc case
+ if world_size == 1:
+ return tensors
+
+    tensor_all = GatherLayer.apply(tensors)
+
+ return torch.cat(tensor_all, dim=0)
+
+
+@torch.no_grad()
+def concat_all_gather(tensor):
+ """
+ Performs all_gather operation on the provided tensors.
+ *** Warning ***: torch.distributed.all_gather has no gradient.
+ """
+ # if use distributed training
+ if not is_dist_avail_and_initialized():
+ return tensor
+
+ tensors_gather = [
+ torch.ones_like(tensor) for _ in range(torch.distributed.get_world_size())
+ ]
+ torch.distributed.all_gather(tensors_gather, tensor, async_op=False)
+
+ output = torch.cat(tensors_gather, dim=0)
+ return output
+
+
+def tile(x, dim, n_tile):
+ init_dim = x.size(dim)
+ repeat_idx = [1] * x.dim()
+ repeat_idx[dim] = n_tile
+ x = x.repeat(*(repeat_idx))
+ order_index = torch.LongTensor(
+ np.concatenate([init_dim * np.arange(n_tile) + i for i in range(init_dim)])
+ )
+ return torch.index_select(x, dim, order_index.to(x.device))
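The `tile` helper above repeats each slice along `dim` a total of `n_tile` times, i.e. it behaves like `torch.repeat_interleave` along that dimension. A small illustration with assumed values:
```python
import torch
from minigpt4.models.base_model import tile

x = torch.tensor([1, 2, 3])
print(tile(x, dim=0, n_tile=2))              # tensor([1, 1, 2, 2, 3, 3])
print(torch.repeat_interleave(x, 2, dim=0))  # same result
```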
diff --git a/models/MiniGPT4/minigpt4/models/blip2.py b/models/MiniGPT4/minigpt4/models/blip2.py
new file mode 100644
index 0000000..ee4a9dc
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/models/blip2.py
@@ -0,0 +1,221 @@
+"""
+ Copyright (c) 2023, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+import contextlib
+import logging
+import os
+import time
+import datetime
+
+import torch
+import torch.nn as nn
+import torch.distributed as dist
+import torch.nn.functional as F
+
+import minigpt4.common.dist_utils as dist_utils
+from minigpt4.common.dist_utils import download_cached_file
+from minigpt4.common.utils import is_url
+from minigpt4.common.logger import MetricLogger
+from minigpt4.models.base_model import BaseModel
+from minigpt4.models.Qformer import BertConfig, BertLMHeadModel
+from minigpt4.models.eva_vit import create_eva_vit_g
+from transformers import BertTokenizer
+
+
+class Blip2Base(BaseModel):
+ @classmethod
+ def init_tokenizer(cls):
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+ tokenizer.add_special_tokens({"bos_token": "[DEC]"})
+ return tokenizer
+
+ def maybe_autocast(self, dtype=torch.float16):
+ # if on cpu, don't use autocast
+ # if on gpu, use autocast with dtype if provided, otherwise use torch.float16
+ enable_autocast = self.device != torch.device("cpu")
+
+ if enable_autocast:
+ return torch.cuda.amp.autocast(dtype=dtype)
+ else:
+ return contextlib.nullcontext()
+
+ @classmethod
+ def init_Qformer(cls, num_query_token, vision_width, cross_attention_freq=2):
+ encoder_config = BertConfig.from_pretrained("bert-base-uncased")
+ encoder_config.encoder_width = vision_width
+ # insert cross-attention layer every other block
+ encoder_config.add_cross_attention = True
+ encoder_config.cross_attention_freq = cross_attention_freq
+ encoder_config.query_length = num_query_token
+ Qformer = BertLMHeadModel(config=encoder_config)
+ query_tokens = nn.Parameter(
+ torch.zeros(1, num_query_token, encoder_config.hidden_size)
+ )
+ query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range)
+ return Qformer, query_tokens
+
+ @classmethod
+ def init_vision_encoder(
+ cls, model_name, img_size, drop_path_rate, use_grad_checkpoint, precision
+ ):
+ assert model_name == "eva_clip_g", "vit model must be eva_clip_g for current version of MiniGPT-4"
+ visual_encoder = create_eva_vit_g(
+ img_size, drop_path_rate, use_grad_checkpoint, precision
+ )
+
+ ln_vision = LayerNorm(visual_encoder.num_features)
+ return visual_encoder, ln_vision
+
+ def load_from_pretrained(self, url_or_filename):
+ if is_url(url_or_filename):
+ cached_file = download_cached_file(
+ url_or_filename, check_hash=False, progress=True
+ )
+ checkpoint = torch.load(cached_file, map_location="cpu")
+ elif os.path.isfile(url_or_filename):
+ checkpoint = torch.load(url_or_filename, map_location="cpu")
+ else:
+ raise RuntimeError("checkpoint url or path is invalid")
+
+ state_dict = checkpoint["model"]
+
+ msg = self.load_state_dict(state_dict, strict=False)
+
+ # logging.info("Missing keys {}".format(msg.missing_keys))
+ logging.info("load checkpoint from %s" % url_or_filename)
+
+ return msg
+
+
+def disabled_train(self, mode=True):
+ """Overwrite model.train with this function to make sure train/eval mode
+ does not change anymore."""
+ return self
+
+
+class LayerNorm(nn.LayerNorm):
+ """Subclass torch's LayerNorm to handle fp16."""
+
+ def forward(self, x: torch.Tensor):
+ orig_type = x.dtype
+ ret = super().forward(x.type(torch.float32))
+ return ret.type(orig_type)
+
+
+def compute_sim_matrix(model, data_loader, **kwargs):
+ k_test = kwargs.pop("k_test")
+
+ metric_logger = MetricLogger(delimiter=" ")
+ header = "Evaluation:"
+
+ logging.info("Computing features for evaluation...")
+ start_time = time.time()
+
+ texts = data_loader.dataset.text
+ num_text = len(texts)
+ text_bs = 256
+ text_ids = []
+ text_embeds = []
+ text_atts = []
+ for i in range(0, num_text, text_bs):
+ text = texts[i : min(num_text, i + text_bs)]
+ text_input = model.tokenizer(
+ text,
+ padding="max_length",
+ truncation=True,
+ max_length=35,
+ return_tensors="pt",
+ ).to(model.device)
+ text_feat = model.forward_text(text_input)
+ text_embed = F.normalize(model.text_proj(text_feat))
+ text_embeds.append(text_embed)
+ text_ids.append(text_input.input_ids)
+ text_atts.append(text_input.attention_mask)
+
+ text_embeds = torch.cat(text_embeds, dim=0)
+ text_ids = torch.cat(text_ids, dim=0)
+ text_atts = torch.cat(text_atts, dim=0)
+
+ vit_feats = []
+ image_embeds = []
+ for samples in data_loader:
+ image = samples["image"]
+
+ image = image.to(model.device)
+ image_feat, vit_feat = model.forward_image(image)
+ image_embed = model.vision_proj(image_feat)
+ image_embed = F.normalize(image_embed, dim=-1)
+
+ vit_feats.append(vit_feat.cpu())
+ image_embeds.append(image_embed)
+
+ vit_feats = torch.cat(vit_feats, dim=0)
+ image_embeds = torch.cat(image_embeds, dim=0)
+
+ sims_matrix = []
+ for image_embed in image_embeds:
+ sim_q2t = image_embed @ text_embeds.t()
+ sim_i2t, _ = sim_q2t.max(0)
+ sims_matrix.append(sim_i2t)
+ sims_matrix = torch.stack(sims_matrix, dim=0)
+
+ score_matrix_i2t = torch.full(
+ (len(data_loader.dataset.image), len(texts)), -100.0
+ ).to(model.device)
+
+ num_tasks = dist_utils.get_world_size()
+ rank = dist_utils.get_rank()
+ step = sims_matrix.size(0) // num_tasks + 1
+ start = rank * step
+ end = min(sims_matrix.size(0), start + step)
+
+ for i, sims in enumerate(
+ metric_logger.log_every(sims_matrix[start:end], 50, header)
+ ):
+ topk_sim, topk_idx = sims.topk(k=k_test, dim=0)
+ image_inputs = vit_feats[start + i].repeat(k_test, 1, 1).to(model.device)
+ score = model.compute_itm(
+ image_inputs=image_inputs,
+ text_ids=text_ids[topk_idx],
+ text_atts=text_atts[topk_idx],
+ ).float()
+ score_matrix_i2t[start + i, topk_idx] = score + topk_sim
+
+ sims_matrix = sims_matrix.t()
+ score_matrix_t2i = torch.full(
+ (len(texts), len(data_loader.dataset.image)), -100.0
+ ).to(model.device)
+
+ step = sims_matrix.size(0) // num_tasks + 1
+ start = rank * step
+ end = min(sims_matrix.size(0), start + step)
+
+ for i, sims in enumerate(
+ metric_logger.log_every(sims_matrix[start:end], 50, header)
+ ):
+ topk_sim, topk_idx = sims.topk(k=k_test, dim=0)
+ image_inputs = vit_feats[topk_idx.cpu()].to(model.device)
+ score = model.compute_itm(
+ image_inputs=image_inputs,
+ text_ids=text_ids[start + i].repeat(k_test, 1),
+ text_atts=text_atts[start + i].repeat(k_test, 1),
+ ).float()
+ score_matrix_t2i[start + i, topk_idx] = score + topk_sim
+
+ if dist_utils.is_dist_avail_and_initialized():
+ dist.barrier()
+ torch.distributed.all_reduce(
+ score_matrix_i2t, op=torch.distributed.ReduceOp.SUM
+ )
+ torch.distributed.all_reduce(
+ score_matrix_t2i, op=torch.distributed.ReduceOp.SUM
+ )
+
+ total_time = time.time() - start_time
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+ logging.info("Evaluation time {}".format(total_time_str))
+
+ return score_matrix_i2t.cpu().numpy(), score_matrix_t2i.cpu().numpy()
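As a quick orientation for the helpers above: `init_tokenizer` returns a BERT tokenizer whose BOS token is the extra `[DEC]` token, and `init_Qformer` builds the query-augmented BERT together with its learnable query tokens. A minimal sketch (the width 1408 assumes the EVA ViT-g encoder defined below):
```python
from minigpt4.models.blip2 import Blip2Base

tokenizer = Blip2Base.init_tokenizer()  # bert-base-uncased with "[DEC]" as bos_token
qformer, query_tokens = Blip2Base.init_Qformer(
    num_query_token=32, vision_width=1408, cross_attention_freq=2
)
print(query_tokens.shape)  # torch.Size([1, 32, 768]) with the bert-base hidden size
```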
diff --git a/models/MiniGPT4/minigpt4/models/blip2_outputs.py b/models/MiniGPT4/minigpt4/models/blip2_outputs.py
new file mode 100644
index 0000000..e8722b1
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/models/blip2_outputs.py
@@ -0,0 +1,110 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+from transformers.modeling_outputs import (
+ ModelOutput,
+ BaseModelOutputWithPoolingAndCrossAttentions,
+ CausalLMOutputWithCrossAttentions,
+)
+
+
+@dataclass
+class BlipSimilarity(ModelOutput):
+ sim_i2t: torch.FloatTensor = None
+ sim_t2i: torch.FloatTensor = None
+
+ sim_i2t_m: Optional[torch.FloatTensor] = None
+ sim_t2i_m: Optional[torch.FloatTensor] = None
+
+ sim_i2t_targets: Optional[torch.FloatTensor] = None
+ sim_t2i_targets: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class BlipIntermediateOutput(ModelOutput):
+ """
+ Data class for intermediate outputs of BLIP models.
+
+ image_embeds (torch.FloatTensor): Image embeddings, shape (batch_size, num_patches, embed_dim).
+ text_embeds (torch.FloatTensor): Text embeddings, shape (batch_size, seq_len, embed_dim).
+
+ image_embeds_m (torch.FloatTensor): Image embeddings from momentum visual encoder, shape (batch_size, num_patches, embed_dim).
+ text_embeds_m (torch.FloatTensor): Text embeddings from momentum text encoder, shape (batch_size, seq_len, embed_dim).
+
+ encoder_output (BaseModelOutputWithPoolingAndCrossAttentions): output from the image-grounded text encoder.
+ encoder_output_neg (BaseModelOutputWithPoolingAndCrossAttentions): output from the image-grounded text encoder for negative pairs.
+
+ decoder_output (CausalLMOutputWithCrossAttentions): output from the image-grounded text decoder.
+ decoder_labels (torch.LongTensor): labels for the captioning loss.
+
+ itm_logits (torch.FloatTensor): logits for the image-text matching loss, shape (batch_size * 3, 2).
+ itm_labels (torch.LongTensor): labels for the image-text matching loss, shape (batch_size * 3,)
+
+ """
+
+ # uni-modal features
+ image_embeds: torch.FloatTensor = None
+ text_embeds: Optional[torch.FloatTensor] = None
+
+ image_embeds_m: Optional[torch.FloatTensor] = None
+ text_embeds_m: Optional[torch.FloatTensor] = None
+
+ # intermediate outputs of multimodal encoder
+ encoder_output: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None
+ encoder_output_neg: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None
+
+ itm_logits: Optional[torch.FloatTensor] = None
+ itm_labels: Optional[torch.LongTensor] = None
+
+ # intermediate outputs of multimodal decoder
+ decoder_output: Optional[CausalLMOutputWithCrossAttentions] = None
+ decoder_labels: Optional[torch.LongTensor] = None
+
+
+@dataclass
+class BlipOutput(ModelOutput):
+ # some finetuned models (e.g. BlipVQA) do not compute similarity, thus optional.
+ sims: Optional[BlipSimilarity] = None
+
+ intermediate_output: BlipIntermediateOutput = None
+
+ loss: Optional[torch.FloatTensor] = None
+
+ loss_itc: Optional[torch.FloatTensor] = None
+
+ loss_itm: Optional[torch.FloatTensor] = None
+
+ loss_lm: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class BlipOutputFeatures(ModelOutput):
+ """
+ Data class of features from BlipFeatureExtractor.
+
+ Args:
+ image_embeds: (torch.FloatTensor) of shape (batch_size, num_patches+1, embed_dim), optional
+ image_features: (torch.FloatTensor) of shape (batch_size, num_patches+1, feature_dim), optional
+ text_embeds: (torch.FloatTensor) of shape (batch_size, sequence_length+1, embed_dim), optional
+ text_features: (torch.FloatTensor) of shape (batch_size, sequence_length+1, feature_dim), optional
+
+ The first embedding or feature is for the [CLS] token.
+
+ Features are obtained by projecting the corresponding embedding into a normalized low-dimensional space.
+ """
+
+ image_embeds: Optional[torch.FloatTensor] = None
+ image_embeds_proj: Optional[torch.FloatTensor] = None
+
+ text_embeds: Optional[torch.FloatTensor] = None
+ text_embeds_proj: Optional[torch.FloatTensor] = None
+
+ multimodal_embeds: Optional[torch.FloatTensor] = None
diff --git a/models/MiniGPT4/minigpt4/models/eva_vit.py b/models/MiniGPT4/minigpt4/models/eva_vit.py
new file mode 100644
index 0000000..7fcc63a
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/models/eva_vit.py
@@ -0,0 +1,442 @@
+# Based on EVA, BEIT, timm and DeiT code bases
+# https://github.com/baaivision/EVA
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm
+# https://github.com/microsoft/unilm/tree/master/beit
+# https://github.com/facebookresearch/deit/
+# https://github.com/facebookresearch/dino
+# --------------------------------------------------------'
+import math
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+from timm.models.layers import drop_path, to_2tuple, trunc_normal_
+from timm.models.registry import register_model
+
+from minigpt4.common.dist_utils import download_cached_file
+
+def _cfg(url='', **kwargs):
+ return {
+ 'url': url,
+ 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
+ 'crop_pct': .9, 'interpolation': 'bicubic',
+ 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5),
+ **kwargs
+ }
+
+
+class DropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+ """
+ def __init__(self, drop_prob=None):
+ super(DropPath, self).__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, x):
+ return drop_path(x, self.drop_prob, self.training)
+
+ def extra_repr(self) -> str:
+ return 'p={}'.format(self.drop_prob)
+
+
+class Mlp(nn.Module):
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.fc1 = nn.Linear(in_features, hidden_features)
+ self.act = act_layer()
+ self.fc2 = nn.Linear(hidden_features, out_features)
+ self.drop = nn.Dropout(drop)
+
+ def forward(self, x):
+ x = self.fc1(x)
+ x = self.act(x)
+        # x = self.drop(x)
+        # dropout after fc1 is commented out, following the original BERT implementation
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
+
+
+class Attention(nn.Module):
+ def __init__(
+ self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
+ proj_drop=0., window_size=None, attn_head_dim=None):
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ if attn_head_dim is not None:
+ head_dim = attn_head_dim
+ all_head_dim = head_dim * self.num_heads
+ self.scale = qk_scale or head_dim ** -0.5
+
+ self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
+ if qkv_bias:
+ self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
+ self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
+ else:
+ self.q_bias = None
+ self.v_bias = None
+
+ if window_size:
+ self.window_size = window_size
+ self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
+ self.relative_position_bias_table = nn.Parameter(
+ torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH
+ # cls to token & token 2 cls & cls to cls
+
+ # get pair-wise relative position index for each token inside the window
+ coords_h = torch.arange(window_size[0])
+ coords_w = torch.arange(window_size[1])
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
+ relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
+ relative_coords[:, :, 1] += window_size[1] - 1
+ relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+ relative_position_index = \
+ torch.zeros(size=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
+ relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
+ relative_position_index[0, 0:] = self.num_relative_distance - 3
+ relative_position_index[0:, 0] = self.num_relative_distance - 2
+ relative_position_index[0, 0] = self.num_relative_distance - 1
+
+ self.register_buffer("relative_position_index", relative_position_index)
+ else:
+ self.window_size = None
+ self.relative_position_bias_table = None
+ self.relative_position_index = None
+
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(all_head_dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ def forward(self, x, rel_pos_bias=None):
+ B, N, C = x.shape
+ qkv_bias = None
+ if self.q_bias is not None:
+ qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
+ # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+ qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
+ qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
+
+ q = q * self.scale
+ attn = (q @ k.transpose(-2, -1))
+
+ if self.relative_position_bias_table is not None:
+ relative_position_bias = \
+ self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+ self.window_size[0] * self.window_size[1] + 1,
+ self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
+ attn = attn + relative_position_bias.unsqueeze(0)
+
+ if rel_pos_bias is not None:
+ attn = attn + rel_pos_bias
+
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+
+class Block(nn.Module):
+
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+ drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
+ window_size=None, attn_head_dim=None):
+ super().__init__()
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention(
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim)
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+ if init_values is not None and init_values > 0:
+ self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
+ self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
+ else:
+ self.gamma_1, self.gamma_2 = None, None
+
+ def forward(self, x, rel_pos_bias=None):
+ if self.gamma_1 is None:
+ x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias))
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ else:
+ x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias))
+ x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
+ return x
+
+
+class PatchEmbed(nn.Module):
+ """ Image to Patch Embedding
+ """
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+ super().__init__()
+ img_size = to_2tuple(img_size)
+ patch_size = to_2tuple(patch_size)
+ num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
+ self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
+ self.img_size = img_size
+ self.patch_size = patch_size
+ self.num_patches = num_patches
+
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+ def forward(self, x, **kwargs):
+ B, C, H, W = x.shape
+ # FIXME look at relaxing size constraints
+ assert H == self.img_size[0] and W == self.img_size[1], \
+ f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+ x = self.proj(x).flatten(2).transpose(1, 2)
+ return x
+
+
+class RelativePositionBias(nn.Module):
+
+ def __init__(self, window_size, num_heads):
+ super().__init__()
+ self.window_size = window_size
+ self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
+ self.relative_position_bias_table = nn.Parameter(
+ torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH
+ # cls to token & token 2 cls & cls to cls
+
+ # get pair-wise relative position index for each token inside the window
+ coords_h = torch.arange(window_size[0])
+ coords_w = torch.arange(window_size[1])
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
+ relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
+ relative_coords[:, :, 1] += window_size[1] - 1
+ relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+ relative_position_index = \
+ torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
+ relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
+ relative_position_index[0, 0:] = self.num_relative_distance - 3
+ relative_position_index[0:, 0] = self.num_relative_distance - 2
+ relative_position_index[0, 0] = self.num_relative_distance - 1
+
+ self.register_buffer("relative_position_index", relative_position_index)
+
+ # trunc_normal_(self.relative_position_bias_table, std=.02)
+
+ def forward(self):
+ relative_position_bias = \
+ self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+ self.window_size[0] * self.window_size[1] + 1,
+ self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH
+ return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
+
+
+class VisionTransformer(nn.Module):
+ """ Vision Transformer with support for patch or hybrid CNN input stage
+ """
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
+ num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
+ drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None,
+ use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False,
+ use_mean_pooling=True, init_scale=0.001, use_checkpoint=False):
+ super().__init__()
+ self.image_size = img_size
+ self.num_classes = num_classes
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
+
+ self.patch_embed = PatchEmbed(
+ img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+ num_patches = self.patch_embed.num_patches
+
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+ if use_abs_pos_emb:
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
+ else:
+ self.pos_embed = None
+ self.pos_drop = nn.Dropout(p=drop_rate)
+
+ if use_shared_rel_pos_bias:
+ self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads)
+ else:
+ self.rel_pos_bias = None
+ self.use_checkpoint = use_checkpoint
+
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
+ self.use_rel_pos_bias = use_rel_pos_bias
+ self.blocks = nn.ModuleList([
+ Block(
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
+ init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None)
+ for i in range(depth)])
+# self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim)
+# self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None
+# self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ if self.pos_embed is not None:
+ trunc_normal_(self.pos_embed, std=.02)
+ trunc_normal_(self.cls_token, std=.02)
+ # trunc_normal_(self.mask_token, std=.02)
+# if isinstance(self.head, nn.Linear):
+# trunc_normal_(self.head.weight, std=.02)
+ self.apply(self._init_weights)
+ self.fix_init_weight()
+# if isinstance(self.head, nn.Linear):
+# self.head.weight.data.mul_(init_scale)
+# self.head.bias.data.mul_(init_scale)
+
+ def fix_init_weight(self):
+ def rescale(param, layer_id):
+ param.div_(math.sqrt(2.0 * layer_id))
+
+ for layer_id, layer in enumerate(self.blocks):
+ rescale(layer.attn.proj.weight.data, layer_id + 1)
+ rescale(layer.mlp.fc2.weight.data, layer_id + 1)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+
+ def get_classifier(self):
+ return self.head
+
+ def reset_classifier(self, num_classes, global_pool=''):
+ self.num_classes = num_classes
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ def forward_features(self, x):
+ x = self.patch_embed(x)
+ batch_size, seq_len, _ = x.size()
+
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
+ x = torch.cat((cls_tokens, x), dim=1)
+ if self.pos_embed is not None:
+ x = x + self.pos_embed
+ x = self.pos_drop(x)
+
+ rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
+ for blk in self.blocks:
+ if self.use_checkpoint:
+ x = checkpoint.checkpoint(blk, x, rel_pos_bias)
+ else:
+ x = blk(x, rel_pos_bias)
+ return x
+# x = self.norm(x)
+
+# if self.fc_norm is not None:
+# t = x[:, 1:, :]
+# return self.fc_norm(t.mean(1))
+# else:
+# return x[:, 0]
+
+ def forward(self, x):
+ x = self.forward_features(x)
+# x = self.head(x)
+ return x
+
+ def get_intermediate_layers(self, x):
+ x = self.patch_embed(x)
+ batch_size, seq_len, _ = x.size()
+
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
+ x = torch.cat((cls_tokens, x), dim=1)
+ if self.pos_embed is not None:
+ x = x + self.pos_embed
+ x = self.pos_drop(x)
+
+ features = []
+ rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
+ for blk in self.blocks:
+ x = blk(x, rel_pos_bias)
+ features.append(x)
+
+ return features
+
+
+def interpolate_pos_embed(model, checkpoint_model):
+ if 'pos_embed' in checkpoint_model:
+ pos_embed_checkpoint = checkpoint_model['pos_embed'].float()
+ embedding_size = pos_embed_checkpoint.shape[-1]
+ num_patches = model.patch_embed.num_patches
+ num_extra_tokens = model.pos_embed.shape[-2] - num_patches
+ # height (== width) for the checkpoint position embedding
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
+ # height (== width) for the new position embedding
+ new_size = int(num_patches ** 0.5)
+ # class_token and dist_token are kept unchanged
+ if orig_size != new_size:
+ print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
+ # only the position tokens are interpolated
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
+ pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
+ pos_tokens = torch.nn.functional.interpolate(
+ pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
+ checkpoint_model['pos_embed'] = new_pos_embed
+
+
+def convert_weights_to_fp16(model: nn.Module):
+ """Convert applicable model parameters to fp16"""
+
+ def _convert_weights_to_fp16(l):
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
+ l.weight.data = l.weight.data.half()
+ if l.bias is not None:
+ l.bias.data = l.bias.data.half()
+
+# if isinstance(l, (nn.MultiheadAttention, Attention)):
+# for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
+# tensor = getattr(l, attr)
+# if tensor is not None:
+# tensor.data = tensor.data.half()
+
+ model.apply(_convert_weights_to_fp16)
+
+
+def create_eva_vit_g(img_size=224,drop_path_rate=0.4,use_checkpoint=False,precision="fp16"):
+ model = VisionTransformer(
+ img_size=img_size,
+ patch_size=14,
+ use_mean_pooling=False,
+ embed_dim=1408,
+ depth=39,
+ num_heads=1408//88,
+ mlp_ratio=4.3637,
+ qkv_bias=True,
+ drop_path_rate=drop_path_rate,
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ use_checkpoint=use_checkpoint,
+ )
+ url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/eva_vit_g.pth"
+ cached_file = download_cached_file(
+ url, check_hash=False, progress=True
+ )
+ state_dict = torch.load(cached_file, map_location="cpu")
+ interpolate_pos_embed(model,state_dict)
+
+ incompatible_keys = model.load_state_dict(state_dict, strict=False)
+# print(incompatible_keys)
+
+ if precision == "fp16":
+# model.to("cuda")
+ convert_weights_to_fp16(model)
+ return model
\ No newline at end of file
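`create_eva_vit_g` above downloads the EVA ViT-g weights from the URL hard-coded in the function and returns a patch-14 encoder whose features feed the Q-Former. A hedged sketch of a forward pass (kept in fp32 so it also runs on CPU; the first call downloads the checkpoint):
```python
import torch
from minigpt4.models.eva_vit import create_eva_vit_g

# precision="fp32" skips the fp16 conversion so the sketch also runs on CPU
visual_encoder = create_eva_vit_g(img_size=224, drop_path_rate=0.0,
                                  use_checkpoint=False, precision="fp32")
x = torch.randn(1, 3, 224, 224)
feats = visual_encoder(x)
print(feats.shape)  # (1, 257, 1408): 1 cls token + (224 / 14) ** 2 patches, embed_dim 1408
```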
diff --git a/models/MiniGPT4/minigpt4/models/mini_gpt4.py b/models/MiniGPT4/minigpt4/models/mini_gpt4.py
new file mode 100644
index 0000000..667edd5
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/models/mini_gpt4.py
@@ -0,0 +1,268 @@
+import logging
+import random
+
+import torch
+from torch.cuda.amp import autocast as autocast
+import torch.nn as nn
+
+from minigpt4.common.registry import registry
+from minigpt4.models.blip2 import Blip2Base, disabled_train
+from minigpt4.models.modeling_llama import LlamaForCausalLM
+from transformers import LlamaTokenizer
+
+
+@registry.register_model("mini_gpt4")
+class MiniGPT4(Blip2Base):
+ """
+ BLIP2 GPT-LLAMA model.
+ """
+
+ PRETRAINED_MODEL_CONFIG_DICT = {
+ "pretrain_vicuna": "configs/models/minigpt4.yaml",
+ }
+
+ def __init__(
+ self,
+ vit_model="eva_clip_g",
+ q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth",
+ img_size=224,
+ drop_path_rate=0,
+ use_grad_checkpoint=False,
+ vit_precision="fp16",
+ freeze_vit=True,
+ freeze_qformer=True,
+ num_query_token=32,
+ llama_model="",
+ prompt_path="",
+ prompt_template="",
+ max_txt_len=32,
+ end_sym='\n',
+        low_resource=False,  # use 8-bit quantization and keep the ViT on CPU
+        device_8bit=0,  # the device for the 8-bit model must be set at loading time and cannot be changed afterwards
+ ):
+ super().__init__()
+
+ self.tokenizer = self.init_tokenizer()
+ self.low_resource = low_resource
+
+ print('Loading VIT')
+ self.visual_encoder, self.ln_vision = self.init_vision_encoder(
+ vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
+ )
+ if freeze_vit:
+ for name, param in self.visual_encoder.named_parameters():
+ param.requires_grad = False
+ self.visual_encoder = self.visual_encoder.eval()
+ self.visual_encoder.train = disabled_train
+ for name, param in self.ln_vision.named_parameters():
+ param.requires_grad = False
+ self.ln_vision = self.ln_vision.eval()
+ self.ln_vision.train = disabled_train
+ logging.info("freeze vision encoder")
+ print('Loading VIT Done')
+
+ print('Loading Q-Former')
+ self.Qformer, self.query_tokens = self.init_Qformer(
+ num_query_token, self.visual_encoder.num_features
+ )
+ self.Qformer.cls = None
+ self.Qformer.bert.embeddings.word_embeddings = None
+ self.Qformer.bert.embeddings.position_embeddings = None
+ for layer in self.Qformer.bert.encoder.layer:
+ layer.output = None
+ layer.intermediate = None
+ self.load_from_pretrained(url_or_filename=q_former_model)
+
+ if freeze_qformer:
+ for name, param in self.Qformer.named_parameters():
+ param.requires_grad = False
+ self.Qformer = self.Qformer.eval()
+ self.Qformer.train = disabled_train
+ self.query_tokens.requires_grad = False
+ logging.info("freeze Qformer")
+ print('Loading Q-Former Done')
+
+ print('Loading LLAMA')
+ self.llama_tokenizer = LlamaTokenizer.from_pretrained(llama_model, use_fast=False)
+ self.llama_tokenizer.pad_token = self.llama_tokenizer.eos_token
+
+ if self.low_resource:
+ self.llama_model = LlamaForCausalLM.from_pretrained(
+ llama_model,
+ torch_dtype=torch.float16,
+ load_in_8bit=True,
+ device_map={'': device_8bit}
+ )
+ else:
+ self.llama_model = LlamaForCausalLM.from_pretrained(
+ llama_model,
+ torch_dtype=torch.float16,
+ )
+
+ for name, param in self.llama_model.named_parameters():
+ param.requires_grad = False
+ print('Loading LLAMA Done')
+
+ self.llama_proj = nn.Linear(
+ self.Qformer.config.hidden_size, self.llama_model.config.hidden_size
+ )
+ self.max_txt_len = max_txt_len
+ self.end_sym = end_sym
+
+ if prompt_path:
+ with open(prompt_path, 'r') as f:
+ raw_prompts = f.read().splitlines()
+            filtered_prompts = [raw_prompt for raw_prompt in raw_prompts if "<ImageHere>" in raw_prompt]
+            self.prompt_list = [prompt_template.format(p) for p in filtered_prompts]
+ print('Load {} training prompts'.format(len(self.prompt_list)))
+ print('Prompt Example \n{}'.format(random.choice(self.prompt_list)))
+ else:
+ self.prompt_list = []
+
+ def vit_to_cpu(self):
+ self.ln_vision.to("cpu")
+ self.ln_vision.float()
+ self.visual_encoder.to("cpu")
+ self.visual_encoder.float()
+
+ def encode_img(self, image):
+ device = image.device
+ if self.low_resource:
+ self.vit_to_cpu()
+ image = image.to("cpu")
+
+ with self.maybe_autocast():
+ image_embeds = self.ln_vision(self.visual_encoder(image)).to(device)
+ image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)
+
+ query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
+ query_output = self.Qformer.bert(
+ query_embeds=query_tokens,
+ encoder_hidden_states=image_embeds,
+ encoder_attention_mask=image_atts,
+ return_dict=True,
+ )
+
+ inputs_llama = self.llama_proj(query_output.last_hidden_state)
+ atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device)
+ return inputs_llama, atts_llama
+
+ def prompt_wrap(self, img_embeds, atts_img, prompt):
+ if prompt:
+ batch_size = img_embeds.shape[0]
+            p_before, p_after = prompt.split('<ImageHere>')
+ p_before_tokens = self.llama_tokenizer(
+ p_before, return_tensors="pt", add_special_tokens=False).to(img_embeds.device)
+ p_after_tokens = self.llama_tokenizer(
+ p_after, return_tensors="pt", add_special_tokens=False).to(img_embeds.device)
+ p_before_embeds = self.llama_model.model.embed_tokens(p_before_tokens.input_ids).expand(batch_size, -1, -1)
+ p_after_embeds = self.llama_model.model.embed_tokens(p_after_tokens.input_ids).expand(batch_size, -1, -1)
+ wrapped_img_embeds = torch.cat([p_before_embeds, img_embeds, p_after_embeds], dim=1)
+ wrapped_atts_img = atts_img[:, :1].expand(-1, wrapped_img_embeds.shape[1])
+ return wrapped_img_embeds, wrapped_atts_img
+ else:
+ return img_embeds, atts_img
+
+ def forward(self, samples):
+ image = samples["image"]
+ img_embeds, atts_img = self.encode_img(image)
+ if hasattr(samples, 'question_split'): # VQA dataset
+ print('VQA Batch')
+            vqa_prompt = '###Human: <Img><ImageHere></Img> '
+ img_embeds, atts_img = self.prompt_wrap(img_embeds, atts_img, vqa_prompt)
+ elif self.prompt_list:
+ prompt = random.choice(self.prompt_list)
+ img_embeds, atts_img = self.prompt_wrap(img_embeds, atts_img, prompt)
+
+ self.llama_tokenizer.padding_side = "right"
+
+ text = [t + self.end_sym for t in samples["text_input"]]
+
+ to_regress_tokens = self.llama_tokenizer(
+ text,
+ return_tensors="pt",
+ padding="longest",
+ truncation=True,
+ max_length=self.max_txt_len,
+ add_special_tokens=False
+ ).to(image.device)
+
+ targets = to_regress_tokens.input_ids.masked_fill(
+ to_regress_tokens.input_ids == self.llama_tokenizer.pad_token_id, -100
+ )
+
+ empty_targets = (
+ torch.ones([atts_img.shape[0], atts_img.shape[1]+1],
+ dtype=torch.long).to(image.device).fill_(-100) # plus one for bos
+ )
+ targets = torch.cat([empty_targets, targets], dim=1)
+
+ batch_size = img_embeds.shape[0]
+ bos = torch.ones([batch_size, 1],
+ dtype=to_regress_tokens.input_ids.dtype,
+ device=to_regress_tokens.input_ids.device) * self.llama_tokenizer.bos_token_id
+ bos_embeds = self.llama_model.model.embed_tokens(bos)
+ atts_bos = atts_img[:, :1]
+
+ to_regress_embeds = self.llama_model.model.embed_tokens(to_regress_tokens.input_ids)
+ inputs_embeds = torch.cat([bos_embeds, img_embeds, to_regress_embeds], dim=1)
+ attention_mask = torch.cat([atts_bos, atts_img, to_regress_tokens.attention_mask], dim=1)
+
+ with self.maybe_autocast():
+ outputs = self.llama_model(
+ inputs_embeds=inputs_embeds,
+ attention_mask=attention_mask,
+ return_dict=True,
+ labels=targets,
+ )
+ loss = outputs.loss
+
+ return {"loss": loss}
+
+ @classmethod
+ def from_config(cls, cfg):
+ vit_model = cfg.get("vit_model", "eva_clip_g")
+ q_former_model = cfg.get("q_former_model", "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth")
+ img_size = cfg.get("image_size")
+ num_query_token = cfg.get("num_query_token")
+ llama_model = cfg.get("llama_model")
+
+ drop_path_rate = cfg.get("drop_path_rate", 0)
+ use_grad_checkpoint = cfg.get("use_grad_checkpoint", False)
+ vit_precision = cfg.get("vit_precision", "fp16")
+ freeze_vit = cfg.get("freeze_vit", True)
+ freeze_qformer = cfg.get("freeze_qformer", True)
+ low_resource = cfg.get("low_resource", False)
+ device_8bit = cfg.get("device_8bit", 0)
+
+ prompt_path = cfg.get("prompt_path", "")
+ prompt_template = cfg.get("prompt_template", "")
+ max_txt_len = cfg.get("max_txt_len", 32)
+ end_sym = cfg.get("end_sym", '\n')
+
+ model = cls(
+ vit_model=vit_model,
+ q_former_model=q_former_model,
+ img_size=img_size,
+ drop_path_rate=drop_path_rate,
+ use_grad_checkpoint=use_grad_checkpoint,
+ vit_precision=vit_precision,
+ freeze_vit=freeze_vit,
+ freeze_qformer=freeze_qformer,
+ num_query_token=num_query_token,
+ llama_model=llama_model,
+ prompt_path=prompt_path,
+ prompt_template=prompt_template,
+ max_txt_len=max_txt_len,
+ end_sym=end_sym,
+ low_resource=low_resource,
+ device_8bit=device_8bit,
+ )
+
+ ckpt_path = cfg.get("ckpt", "") # load weights of MiniGPT-4
+ if ckpt_path:
+ print("Load BLIP2-LLM Checkpoint: {}".format(ckpt_path))
+ ckpt = torch.load(ckpt_path, map_location="cpu")
+ msg = model.load_state_dict(ckpt['model'], strict=False)
+
+ return model
diff --git a/models/MiniGPT4/minigpt4/models/modeling_llama.py b/models/MiniGPT4/minigpt4/models/modeling_llama.py
new file mode 100644
index 0000000..12d980e
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/models/modeling_llama.py
@@ -0,0 +1,755 @@
+# This script is based on https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
+
+""" PyTorch LLaMA model."""
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from transformers.models.llama.configuration_llama import LlamaConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "LlamaConfig"
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make causal mask used for bi-directional self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+ """
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+ """
+ bsz, src_len = mask.size()
+ tgt_len = tgt_len if tgt_len is not None else src_len
+
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+ inverted_mask = 1.0 - expanded_mask
+
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+class LlamaRMSNorm(nn.Module):
+ def __init__(self, hidden_size, eps=1e-6):
+ """
+ LlamaRMSNorm is equivalent to T5LayerNorm
+ """
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states):
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+
+ # convert into half-precision if necessary
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
+ hidden_states = hidden_states.to(self.weight.dtype)
+
+ return self.weight * hidden_states
+
+
+class LlamaRotaryEmbedding(torch.nn.Module):
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+ super().__init__()
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
+ self.register_buffer("inv_freq", inv_freq)
+
+ # Build here to make `torch.jit.trace` work.
+ self.max_seq_len_cached = max_position_embeddings
+ t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
+
+ def forward(self, x, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
+ if seq_len > self.max_seq_len_cached:
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
+ return (
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ )
+
+
+def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
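+ # Gather the cos/sin values at each token's position id, then apply the rotary embedding to q and k.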
+ gather_indices = position_ids[:, None, :, None] # [bs, 1, seq_len, 1]
+ gather_indices = gather_indices.repeat(1, cos.shape[1], 1, cos.shape[3])
+ cos = torch.gather(cos.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
+ sin = torch.gather(sin.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
+
+class LlamaMLP(nn.Module):
+ def __init__(
+ self,
+ hidden_size: int,
+ intermediate_size: int,
+ hidden_act: str,
+ ):
+ super().__init__()
+ self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+ self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
+ self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+ self.act_fn = ACT2FN[hidden_act]
+
+ def forward(self, x):
+ return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+
+class LlamaAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: LlamaConfig):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.hidden_size // self.num_heads
+ self.max_position_embeddings = config.max_position_embeddings
+
+ if (self.head_dim * self.num_heads) != self.hidden_size:
+ raise ValueError(
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+ f" and `num_heads`: {self.num_heads})."
+ )
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+ self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+ self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+ self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ bsz, q_len, _ = hidden_states.size()
+
+ query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+ # [bsz, nh, t, hd]
+
+ if past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+ past_key_value = (key_states, value_states) if use_cache else None
+
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+ )
+ attn_weights = attn_weights + attention_mask
+ attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.transpose(1, 2)
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+ attn_output = self.o_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+class LlamaDecoderLayer(nn.Module):
+ def __init__(self, config: LlamaConfig):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+ self.self_attn = LlamaAttention(config=config)
+ self.mlp = LlamaMLP(
+ hidden_size=self.hidden_size,
+ intermediate_size=config.intermediate_size,
+ hidden_act=config.hidden_act,
+ )
+ self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+ """
+
+ residual = hidden_states
+
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Self Attention
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ return outputs
+
+
+LLAMA_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`LlamaConfig`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+ "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
+ LLAMA_START_DOCSTRING,
+)
+class LlamaPreTrainedModel(PreTrainedModel):
+ config_class = LlamaConfig
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["LlamaDecoderLayer"]
+ _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
+
+ def _init_weights(self, module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, LlamaModel):
+ module.gradient_checkpointing = value
+
+
+LLAMA_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+ information on the default strategy.
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.n_positions - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
+ LLAMA_START_DOCSTRING,
+)
+class LlamaModel(LlamaPreTrainedModel):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
+
+ Args:
+ config: LlamaConfig
+ """
+
+ def __init__(self, config: LlamaConfig):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+ self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.embed_tokens = value
+
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+ # create causal mask
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ combined_attention_mask = None
+ if input_shape[-1] > 1:
+ combined_attention_mask = _make_causal_mask(
+ input_shape,
+ inputs_embeds.dtype,
+ device=inputs_embeds.device,
+ past_key_values_length=past_key_values_length,
+ )
+
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+ inputs_embeds.device
+ )
+ combined_attention_mask = (
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+ )
+
+ return combined_attention_mask
+
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ query_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape
+ elif inputs_embeds is not None:
+ batch_size, seq_length, _ = inputs_embeds.shape
+ else:
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
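+ # Deviation from the stock HF implementation: optional query_embeds (e.g. visual tokens)
+ # are prepended to the token embeddings.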
+ if query_embeds is not None:
+ inputs_embeds = torch.cat([query_embeds, inputs_embeds], dim=1)
+ batch_size, seq_length, _ = inputs_embeds.shape
+
+ seq_length_with_past = seq_length
+ past_key_values_length = 0
+
+ if past_key_values is not None:
+ past_key_values_length = past_key_values[0][0].shape[2]
+ seq_length_with_past = seq_length_with_past + past_key_values_length
+
+ if position_ids is None:
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ position_ids = torch.arange(
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+ )
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+ else:
+ position_ids = position_ids.view(-1, seq_length).long()
+
+ # embed positions
+ if attention_mask is None:
+ attention_mask = torch.ones(
+ (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+ )
+ attention_mask = self._prepare_decoder_attention_mask(
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+ )
+
+ hidden_states = inputs_embeds
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = () if use_cache else None
+
+ for idx, decoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ # None for past_key_value
+ return module(*inputs, output_attentions, None)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(decoder_layer),
+ hidden_states,
+ attention_mask,
+ position_ids,
+ None,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
+
+
+class LlamaForCausalLM(LlamaPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = LlamaModel(config)
+
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.model = decoder
+
+ def get_decoder(self):
+ return self.model
+
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ query_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+ r"""
+ Args:
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, LlamaForCausalLM
+
+ >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+ >>> # Generate
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+ ```"""
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ query_embeds=query_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs[0]
+ logits = self.lm_head(hidden_states)
+
+ loss = None
+ if labels is not None:
+ # Shift so that tokens < n predict n
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
+ shift_labels = shift_labels.view(-1)
+ # Enable model parallelism
+ shift_labels = shift_labels.to(shift_logits.device)
+ loss = loss_fct(shift_logits, shift_labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def prepare_inputs_for_generation(
+ self, input_ids, query_embeds=None, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+ ):
+ if past_key_values:
+ input_ids = input_ids[:, -1:]
+
+ position_ids = kwargs.get("position_ids", None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -1].unsqueeze(-1)
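+ # The query embeddings are only needed on the first generation step; afterwards
+ # their key/value states are already in the cache.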
+ query_embeds = None
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "query_embeds": query_embeds,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ }
+ )
+ return model_inputs
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
+ return reordered_past
+
diff --git a/models/MiniGPT4/minigpt4/processors/__init__.py b/models/MiniGPT4/minigpt4/processors/__init__.py
new file mode 100644
index 0000000..e560eaa
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/processors/__init__.py
@@ -0,0 +1,33 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+from minigpt4.processors.base_processor import BaseProcessor
+from minigpt4.processors.blip_processors import (
+ Blip2ImageTrainProcessor,
+ Blip2ImageEvalProcessor,
+ BlipCaptionProcessor,
+)
+
+from minigpt4.common.registry import registry
+
+__all__ = [
+ "BaseProcessor",
+ "Blip2ImageTrainProcessor",
+ "Blip2ImageEvalProcessor",
+ "BlipCaptionProcessor",
+]
+
+
+def load_processor(name, cfg=None):
+ """
+ Example
+
+ >>> processor = load_processor("alpro_video_train", cfg=None)
+ """
+ processor = registry.get_processor_class(name).from_config(cfg)
+
+ return processor
diff --git a/models/MiniGPT4/minigpt4/processors/__pycache__/__init__.cpython-310.pyc b/models/MiniGPT4/minigpt4/processors/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..4fa4cdd
Binary files /dev/null and b/models/MiniGPT4/minigpt4/processors/__pycache__/__init__.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/processors/__pycache__/base_processor.cpython-310.pyc b/models/MiniGPT4/minigpt4/processors/__pycache__/base_processor.cpython-310.pyc
new file mode 100644
index 0000000..bbcc0e2
Binary files /dev/null and b/models/MiniGPT4/minigpt4/processors/__pycache__/base_processor.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/processors/__pycache__/blip_processors.cpython-310.pyc b/models/MiniGPT4/minigpt4/processors/__pycache__/blip_processors.cpython-310.pyc
new file mode 100644
index 0000000..96bd8f0
Binary files /dev/null and b/models/MiniGPT4/minigpt4/processors/__pycache__/blip_processors.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/processors/__pycache__/randaugment.cpython-310.pyc b/models/MiniGPT4/minigpt4/processors/__pycache__/randaugment.cpython-310.pyc
new file mode 100644
index 0000000..17f6017
Binary files /dev/null and b/models/MiniGPT4/minigpt4/processors/__pycache__/randaugment.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/processors/base_processor.py b/models/MiniGPT4/minigpt4/processors/base_processor.py
new file mode 100644
index 0000000..39b33cd
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/processors/base_processor.py
@@ -0,0 +1,26 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+from omegaconf import OmegaConf
+
+
+class BaseProcessor:
+ def __init__(self):
+ self.transform = lambda x: x
+ return
+
+ def __call__(self, item):
+ return self.transform(item)
+
+ @classmethod
+ def from_config(cls, cfg=None):
+ return cls()
+
+ def build(self, **kwargs):
+ cfg = OmegaConf.create(kwargs)
+
+ return self.from_config(cfg)
diff --git a/models/MiniGPT4/minigpt4/processors/blip_processors.py b/models/MiniGPT4/minigpt4/processors/blip_processors.py
new file mode 100644
index 0000000..fd26160
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/processors/blip_processors.py
@@ -0,0 +1,141 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import re
+
+from minigpt4.common.registry import registry
+from minigpt4.processors.base_processor import BaseProcessor
+from minigpt4.processors.randaugment import RandomAugment
+from omegaconf import OmegaConf
+from torchvision import transforms
+from torchvision.transforms.functional import InterpolationMode
+
+
+class BlipImageBaseProcessor(BaseProcessor):
+ def __init__(self, mean=None, std=None):
+ if mean is None:
+ mean = (0.48145466, 0.4578275, 0.40821073)
+ if std is None:
+ std = (0.26862954, 0.26130258, 0.27577711)
+
+ self.normalize = transforms.Normalize(mean, std)
+
+
+@registry.register_processor("blip_caption")
+class BlipCaptionProcessor(BaseProcessor):
+ def __init__(self, prompt="", max_words=50):
+ self.prompt = prompt
+ self.max_words = max_words
+
+ def __call__(self, caption):
+ caption = self.prompt + self.pre_caption(caption)
+
+ return caption
+
+ @classmethod
+ def from_config(cls, cfg=None):
+ if cfg is None:
+ cfg = OmegaConf.create()
+
+ prompt = cfg.get("prompt", "")
+ max_words = cfg.get("max_words", 50)
+
+ return cls(prompt=prompt, max_words=max_words)
+
+ def pre_caption(self, caption):
+ caption = re.sub(
+ r"([.!\"()*#:;~])",
+ " ",
+ caption.lower(),
+ )
+ caption = re.sub(
+ r"\s{2,}",
+ " ",
+ caption,
+ )
+ caption = caption.rstrip("\n")
+ caption = caption.strip(" ")
+
+ # truncate caption
+ caption_words = caption.split(" ")
+ if len(caption_words) > self.max_words:
+ caption = " ".join(caption_words[: self.max_words])
+
+ return caption
+
+
+@registry.register_processor("blip2_image_train")
+class Blip2ImageTrainProcessor(BlipImageBaseProcessor):
+ def __init__(self, image_size=224, mean=None, std=None, min_scale=0.5, max_scale=1.0):
+ super().__init__(mean=mean, std=std)
+
+ self.transform = transforms.Compose(
+ [
+ transforms.RandomResizedCrop(
+ image_size,
+ scale=(min_scale, max_scale),
+ interpolation=InterpolationMode.BICUBIC,
+ ),
+ transforms.ToTensor(),
+ self.normalize,
+ ]
+ )
+
+ def __call__(self, item):
+ return self.transform(item)
+
+ @classmethod
+ def from_config(cls, cfg=None):
+ if cfg is None:
+ cfg = OmegaConf.create()
+
+ image_size = cfg.get("image_size", 224)
+
+ mean = cfg.get("mean", None)
+ std = cfg.get("std", None)
+
+ min_scale = cfg.get("min_scale", 0.5)
+ max_scale = cfg.get("max_scale", 1.0)
+
+ return cls(
+ image_size=image_size,
+ mean=mean,
+ std=std,
+ min_scale=min_scale,
+ max_scale=max_scale,
+ )
+
+
+@registry.register_processor("blip2_image_eval")
+class Blip2ImageEvalProcessor(BlipImageBaseProcessor):
+ def __init__(self, image_size=224, mean=None, std=None):
+ super().__init__(mean=mean, std=std)
+
+ self.transform = transforms.Compose(
+ [
+ transforms.Resize(
+ (image_size, image_size), interpolation=InterpolationMode.BICUBIC
+ ),
+ transforms.ToTensor(),
+ self.normalize,
+ ]
+ )
+
+ def __call__(self, item):
+ return self.transform(item)
+
+ @classmethod
+ def from_config(cls, cfg=None):
+ if cfg is None:
+ cfg = OmegaConf.create()
+
+ image_size = cfg.get("image_size", 224)
+
+ mean = cfg.get("mean", None)
+ std = cfg.get("std", None)
+
+ return cls(image_size=image_size, mean=mean, std=std)
\ No newline at end of file
diff --git a/models/MiniGPT4/minigpt4/processors/randaugment.py b/models/MiniGPT4/minigpt4/processors/randaugment.py
new file mode 100644
index 0000000..7034a49
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/processors/randaugment.py
@@ -0,0 +1,398 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import cv2
+import numpy as np
+
+import torch
+
+
+## aug functions
+def identity_func(img):
+ return img
+
+
+def autocontrast_func(img, cutoff=0):
+ """
+ same output as PIL.ImageOps.autocontrast
+ """
+ n_bins = 256
+
+ def tune_channel(ch):
+ n = ch.size
+ cut = cutoff * n // 100
+ if cut == 0:
+ high, low = ch.max(), ch.min()
+ else:
+ hist = cv2.calcHist([ch], [0], None, [n_bins], [0, n_bins])
+ low = np.argwhere(np.cumsum(hist) > cut)
+ low = 0 if low.shape[0] == 0 else low[0]
+ high = np.argwhere(np.cumsum(hist[::-1]) > cut)
+ high = n_bins - 1 if high.shape[0] == 0 else n_bins - 1 - high[0]
+ if high <= low:
+ table = np.arange(n_bins)
+ else:
+ scale = (n_bins - 1) / (high - low)
+ offset = -low * scale
+ table = np.arange(n_bins) * scale + offset
+ table[table < 0] = 0
+ table[table > n_bins - 1] = n_bins - 1
+ table = table.clip(0, 255).astype(np.uint8)
+ return table[ch]
+
+ channels = [tune_channel(ch) for ch in cv2.split(img)]
+ out = cv2.merge(channels)
+ return out
+
+
+def equalize_func(img):
+ """
+ same output as PIL.ImageOps.equalize
+ PIL's implementation is different from cv2.equalizeHist
+ """
+ n_bins = 256
+
+ def tune_channel(ch):
+ hist = cv2.calcHist([ch], [0], None, [n_bins], [0, n_bins])
+ non_zero_hist = hist[hist != 0].reshape(-1)
+ step = np.sum(non_zero_hist[:-1]) // (n_bins - 1)
+ if step == 0:
+ return ch
+ n = np.empty_like(hist)
+ n[0] = step // 2
+ n[1:] = hist[:-1]
+ table = (np.cumsum(n) // step).clip(0, 255).astype(np.uint8)
+ return table[ch]
+
+ channels = [tune_channel(ch) for ch in cv2.split(img)]
+ out = cv2.merge(channels)
+ return out
+
+
+def rotate_func(img, degree, fill=(0, 0, 0)):
+ """
+ like PIL, rotate by degree, not radians
+ """
+ H, W = img.shape[0], img.shape[1]
+ center = W / 2, H / 2
+ M = cv2.getRotationMatrix2D(center, degree, 1)
+ out = cv2.warpAffine(img, M, (W, H), borderValue=fill)
+ return out
+
+
+def solarize_func(img, thresh=128):
+ """
+ same output as PIL.ImageOps.solarize
+ """
+ table = np.array([el if el < thresh else 255 - el for el in range(256)])
+ table = table.clip(0, 255).astype(np.uint8)
+ out = table[img]
+ return out
+
+
+def color_func(img, factor):
+ """
+ same output as PIL.ImageEnhance.Color
+ """
+ ## implementation according to PIL definition, quite slow
+ # degenerate = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)[:, :, np.newaxis]
+ # out = blend(degenerate, img, factor)
+ # M = (
+ # np.eye(3) * factor
+ # + np.float32([0.114, 0.587, 0.299]).reshape(3, 1) * (1. - factor)
+ # )[np.newaxis, np.newaxis, :]
+ M = np.float32(
+ [[0.886, -0.114, -0.114], [-0.587, 0.413, -0.587], [-0.299, -0.299, 0.701]]
+ ) * factor + np.float32([[0.114], [0.587], [0.299]])
+ out = np.matmul(img, M).clip(0, 255).astype(np.uint8)
+ return out
+
+
+def contrast_func(img, factor):
+ """
+ same output as PIL.ImageEnhance.Contrast
+ """
+ mean = np.sum(np.mean(img, axis=(0, 1)) * np.array([0.114, 0.587, 0.299]))
+ table = (
+ np.array([(el - mean) * factor + mean for el in range(256)])
+ .clip(0, 255)
+ .astype(np.uint8)
+ )
+ out = table[img]
+ return out
+
+
+def brightness_func(img, factor):
+ """
+ same output as PIL.ImageEnhance.Brightness
+ """
+ table = (np.arange(256, dtype=np.float32) * factor).clip(0, 255).astype(np.uint8)
+ out = table[img]
+ return out
+
+
+def sharpness_func(img, factor):
+ """
+ The differences between this result and PIL are all on the 4 boundaries; the center
+ areas are the same.
+ """
+ kernel = np.ones((3, 3), dtype=np.float32)
+ kernel[1][1] = 5
+ kernel /= 13
+ degenerate = cv2.filter2D(img, -1, kernel)
+ if factor == 0.0:
+ out = degenerate
+ elif factor == 1.0:
+ out = img
+ else:
+ out = img.astype(np.float32)
+ degenerate = degenerate.astype(np.float32)[1:-1, 1:-1, :]
+ out[1:-1, 1:-1, :] = degenerate + factor * (out[1:-1, 1:-1, :] - degenerate)
+ out = out.astype(np.uint8)
+ return out
+
+
+def shear_x_func(img, factor, fill=(0, 0, 0)):
+ H, W = img.shape[0], img.shape[1]
+ M = np.float32([[1, factor, 0], [0, 1, 0]])
+ out = cv2.warpAffine(
+ img, M, (W, H), borderValue=fill, flags=cv2.INTER_LINEAR
+ ).astype(np.uint8)
+ return out
+
+
+def translate_x_func(img, offset, fill=(0, 0, 0)):
+ """
+ same output as PIL.Image.transform
+ """
+ H, W = img.shape[0], img.shape[1]
+ M = np.float32([[1, 0, -offset], [0, 1, 0]])
+ out = cv2.warpAffine(
+ img, M, (W, H), borderValue=fill, flags=cv2.INTER_LINEAR
+ ).astype(np.uint8)
+ return out
+
+
+def translate_y_func(img, offset, fill=(0, 0, 0)):
+ """
+ same output as PIL.Image.transform
+ """
+ H, W = img.shape[0], img.shape[1]
+ M = np.float32([[1, 0, 0], [0, 1, -offset]])
+ out = cv2.warpAffine(
+ img, M, (W, H), borderValue=fill, flags=cv2.INTER_LINEAR
+ ).astype(np.uint8)
+ return out
+
+
+def posterize_func(img, bits):
+ """
+ same output as PIL.ImageOps.posterize
+ """
+ out = np.bitwise_and(img, np.uint8(255 << (8 - bits)))
+ return out
+
+
+def shear_y_func(img, factor, fill=(0, 0, 0)):
+ H, W = img.shape[0], img.shape[1]
+ M = np.float32([[1, 0, 0], [factor, 1, 0]])
+ out = cv2.warpAffine(
+ img, M, (W, H), borderValue=fill, flags=cv2.INTER_LINEAR
+ ).astype(np.uint8)
+ return out
+
+
+def cutout_func(img, pad_size, replace=(0, 0, 0)):
+ replace = np.array(replace, dtype=np.uint8)
+ H, W = img.shape[0], img.shape[1]
+ rh, rw = np.random.random(2)
+ pad_size = pad_size // 2
+ ch, cw = int(rh * H), int(rw * W)
+ x1, x2 = max(ch - pad_size, 0), min(ch + pad_size, H)
+ y1, y2 = max(cw - pad_size, 0), min(cw + pad_size, W)
+ out = img.copy()
+ out[x1:x2, y1:y2, :] = replace
+ return out
+
+
+### level to args
+def enhance_level_to_args(MAX_LEVEL):
+ def level_to_args(level):
+ return ((level / MAX_LEVEL) * 1.8 + 0.1,)
+
+ return level_to_args
+
+
+def shear_level_to_args(MAX_LEVEL, replace_value):
+ def level_to_args(level):
+ level = (level / MAX_LEVEL) * 0.3
+ if np.random.random() > 0.5:
+ level = -level
+ return (level, replace_value)
+
+ return level_to_args
+
+
+def translate_level_to_args(translate_const, MAX_LEVEL, replace_value):
+ def level_to_args(level):
+ level = (level / MAX_LEVEL) * float(translate_const)
+ if np.random.random() > 0.5:
+ level = -level
+ return (level, replace_value)
+
+ return level_to_args
+
+
+def cutout_level_to_args(cutout_const, MAX_LEVEL, replace_value):
+ def level_to_args(level):
+ level = int((level / MAX_LEVEL) * cutout_const)
+ return (level, replace_value)
+
+ return level_to_args
+
+
+def solarize_level_to_args(MAX_LEVEL):
+ def level_to_args(level):
+ level = int((level / MAX_LEVEL) * 256)
+ return (level,)
+
+ return level_to_args
+
+
+def none_level_to_args(level):
+ return ()
+
+
+def posterize_level_to_args(MAX_LEVEL):
+ def level_to_args(level):
+ level = int((level / MAX_LEVEL) * 4)
+ return (level,)
+
+ return level_to_args
+
+
+def rotate_level_to_args(MAX_LEVEL, replace_value):
+ def level_to_args(level):
+ level = (level / MAX_LEVEL) * 30
+ if np.random.random() < 0.5:
+ level = -level
+ return (level, replace_value)
+
+ return level_to_args
+
+
+func_dict = {
+ "Identity": identity_func,
+ "AutoContrast": autocontrast_func,
+ "Equalize": equalize_func,
+ "Rotate": rotate_func,
+ "Solarize": solarize_func,
+ "Color": color_func,
+ "Contrast": contrast_func,
+ "Brightness": brightness_func,
+ "Sharpness": sharpness_func,
+ "ShearX": shear_x_func,
+ "TranslateX": translate_x_func,
+ "TranslateY": translate_y_func,
+ "Posterize": posterize_func,
+ "ShearY": shear_y_func,
+}
+
+translate_const = 10
+MAX_LEVEL = 10
+replace_value = (128, 128, 128)
+arg_dict = {
+ "Identity": none_level_to_args,
+ "AutoContrast": none_level_to_args,
+ "Equalize": none_level_to_args,
+ "Rotate": rotate_level_to_args(MAX_LEVEL, replace_value),
+ "Solarize": solarize_level_to_args(MAX_LEVEL),
+ "Color": enhance_level_to_args(MAX_LEVEL),
+ "Contrast": enhance_level_to_args(MAX_LEVEL),
+ "Brightness": enhance_level_to_args(MAX_LEVEL),
+ "Sharpness": enhance_level_to_args(MAX_LEVEL),
+ "ShearX": shear_level_to_args(MAX_LEVEL, replace_value),
+ "TranslateX": translate_level_to_args(translate_const, MAX_LEVEL, replace_value),
+ "TranslateY": translate_level_to_args(translate_const, MAX_LEVEL, replace_value),
+ "Posterize": posterize_level_to_args(MAX_LEVEL),
+ "ShearY": shear_level_to_args(MAX_LEVEL, replace_value),
+}
+
+
+class RandomAugment(object):
+ def __init__(self, N=2, M=10, isPIL=False, augs=[]):
+ self.N = N
+ self.M = M
+ self.isPIL = isPIL
+ if augs:
+ self.augs = augs
+ else:
+ self.augs = list(arg_dict.keys())
+
+ def get_random_ops(self):
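+ # Sample N op names uniformly; each op is later applied with probability 0.5 at magnitude M.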
+ sampled_ops = np.random.choice(self.augs, self.N)
+ return [(op, 0.5, self.M) for op in sampled_ops]
+
+ def __call__(self, img):
+ if self.isPIL:
+ img = np.array(img)
+ ops = self.get_random_ops()
+ for name, prob, level in ops:
+ if np.random.random() > prob:
+ continue
+ args = arg_dict[name](level)
+ img = func_dict[name](img, *args)
+ return img
+
+
+class VideoRandomAugment(object):
+ def __init__(self, N=2, M=10, p=0.0, tensor_in_tensor_out=True, augs=[]):
+ self.N = N
+ self.M = M
+ self.p = p
+ self.tensor_in_tensor_out = tensor_in_tensor_out
+ if augs:
+ self.augs = augs
+ else:
+ self.augs = list(arg_dict.keys())
+
+ def get_random_ops(self):
+ sampled_ops = np.random.choice(self.augs, self.N, replace=False)
+ return [(op, self.M) for op in sampled_ops]
+
+ def __call__(self, frames):
+ assert (
+ frames.shape[-1] == 3
+ ), "Expecting last dimension for 3-channels RGB (b, h, w, c)."
+
+ if self.tensor_in_tensor_out:
+ frames = frames.numpy().astype(np.uint8)
+
+ num_frames = frames.shape[0]
+
+ ops = num_frames * [self.get_random_ops()]
+ apply_or_not = num_frames * [np.random.random(size=self.N) > self.p]
+
+ frames = torch.stack(
+ list(map(self._aug, frames, ops, apply_or_not)), dim=0
+ ).float()
+
+ return frames
+
+ def _aug(self, img, ops, apply_or_not):
+ for i, (name, level) in enumerate(ops):
+ if not apply_or_not[i]:
+ continue
+ args = arg_dict[name](level)
+ img = func_dict[name](img, *args)
+ return torch.from_numpy(img)
+
+
+if __name__ == "__main__":
+ a = RandomAugment()
+ img = np.random.randn(32, 32, 3)
+ a(img)
diff --git a/models/MiniGPT4/minigpt4/runners/__init__.py b/models/MiniGPT4/minigpt4/runners/__init__.py
new file mode 100644
index 0000000..64e7a4d
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/runners/__init__.py
@@ -0,0 +1,10 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+from minigpt4.runners.runner_base import RunnerBase
+
+__all__ = ["RunnerBase"]
diff --git a/models/MiniGPT4/minigpt4/runners/__pycache__/__init__.cpython-310.pyc b/models/MiniGPT4/minigpt4/runners/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..2fc7cff
Binary files /dev/null and b/models/MiniGPT4/minigpt4/runners/__pycache__/__init__.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/runners/__pycache__/runner_base.cpython-310.pyc b/models/MiniGPT4/minigpt4/runners/__pycache__/runner_base.cpython-310.pyc
new file mode 100644
index 0000000..504dafc
Binary files /dev/null and b/models/MiniGPT4/minigpt4/runners/__pycache__/runner_base.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/runners/runner_base.py b/models/MiniGPT4/minigpt4/runners/runner_base.py
new file mode 100644
index 0000000..ccb5706
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/runners/runner_base.py
@@ -0,0 +1,658 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import datetime
+import json
+import logging
+import os
+import time
+from pathlib import Path
+
+import torch
+import torch.distributed as dist
+import webdataset as wds
+from minigpt4.common.dist_utils import (
+ download_cached_file,
+ get_rank,
+ get_world_size,
+ is_main_process,
+ main_process,
+)
+from minigpt4.common.registry import registry
+from minigpt4.common.utils import is_url
+from minigpt4.datasets.data_utils import concat_datasets, reorg_datasets_by_split, ChainDataset
+from minigpt4.datasets.datasets.dataloader_utils import (
+ IterLoader,
+ MultiIterLoader,
+ PrefetchLoader,
+)
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.data import DataLoader, DistributedSampler
+
+
+@registry.register_runner("runner_base")
+class RunnerBase:
+ """
+ A runner class to train and evaluate a model given a task and datasets.
+
+ The runner uses PyTorch DistributedDataParallel by default. Future releases
+ will support other distributed frameworks.
+ """
+
+ def __init__(self, cfg, task, model, datasets, job_id):
+ self.config = cfg
+ self.job_id = job_id
+
+ self.task = task
+ self.datasets = datasets
+
+ self._model = model
+
+ self._wrapped_model = None
+ self._device = None
+ self._optimizer = None
+ self._scaler = None
+ self._dataloaders = None
+ self._lr_sched = None
+
+ self.start_epoch = 0
+
+ # self.setup_seeds()
+ self.setup_output_dir()
+
+ @property
+ def device(self):
+ if self._device is None:
+ self._device = torch.device(self.config.run_cfg.device)
+
+ return self._device
+
+ @property
+ def use_distributed(self):
+ return self.config.run_cfg.distributed
+
+ @property
+ def model(self):
+ """
+ A property to get the DDP-wrapped model on the device.
+ """
+ # move model to device
+ if self._model.device != self.device:
+ self._model = self._model.to(self.device)
+
+ # distributed training wrapper
+ if self.use_distributed:
+ if self._wrapped_model is None:
+ self._wrapped_model = DDP(
+ self._model, device_ids=[self.config.run_cfg.gpu]
+ )
+ else:
+ self._wrapped_model = self._model
+
+ return self._wrapped_model
+
+ @property
+ def optimizer(self):
+ # TODO make optimizer class and configurations
+ if self._optimizer is None:
+ num_parameters = 0
+ p_wd, p_non_wd = [], []
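+ # Split trainable parameters into weight-decay and no-weight-decay groups
+ # (biases and normalization weights are excluded from weight decay).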
+ for n, p in self.model.named_parameters():
+ if not p.requires_grad:
+ continue # frozen weights
+ print(n)
+ if p.ndim < 2 or "bias" in n or "ln" in n or "bn" in n:
+ p_non_wd.append(p)
+ else:
+ p_wd.append(p)
+ num_parameters += p.data.nelement()
+ logging.info("number of trainable parameters: %d" % num_parameters)
+ optim_params = [
+ {
+ "params": p_wd,
+ "weight_decay": float(self.config.run_cfg.weight_decay),
+ },
+ {"params": p_non_wd, "weight_decay": 0},
+ ]
+ beta2 = self.config.run_cfg.get("beta2", 0.999)
+ self._optimizer = torch.optim.AdamW(
+ optim_params,
+ lr=float(self.config.run_cfg.init_lr),
+ weight_decay=float(self.config.run_cfg.weight_decay),
+ betas=(0.9, beta2),
+ )
+
+ return self._optimizer
+
+ @property
+ def scaler(self):
+ amp = self.config.run_cfg.get("amp", False)
+
+ if amp:
+ if self._scaler is None:
+ self._scaler = torch.cuda.amp.GradScaler()
+
+ return self._scaler
+
+ @property
+ def lr_scheduler(self):
+ """
+ A property that lazily creates the learning rate scheduler the first time it is needed.
+ """
+ if self._lr_sched is None:
+ lr_sched_cls = registry.get_lr_scheduler_class(self.config.run_cfg.lr_sched)
+
+ # max_epoch = self.config.run_cfg.max_epoch
+ max_epoch = self.max_epoch
+ # min_lr = self.config.run_cfg.min_lr
+ min_lr = self.min_lr
+ # init_lr = self.config.run_cfg.init_lr
+ init_lr = self.init_lr
+
+ # optional parameters
+ decay_rate = self.config.run_cfg.get("lr_decay_rate", None)
+ warmup_start_lr = self.config.run_cfg.get("warmup_lr", -1)
+ warmup_steps = self.config.run_cfg.get("warmup_steps", 0)
+ iters_per_epoch = self.config.run_cfg.get("iters_per_epoch", None)
+
+ if iters_per_epoch is None:
+ try:
+ iters_per_epoch = len(self.dataloaders['train'])
+ except (AttributeError, TypeError):
+ iters_per_epoch = 10000
+
+ self._lr_sched = lr_sched_cls(
+ optimizer=self.optimizer,
+ max_epoch=max_epoch,
+ iters_per_epoch=iters_per_epoch,
+ min_lr=min_lr,
+ init_lr=init_lr,
+ decay_rate=decay_rate,
+ warmup_start_lr=warmup_start_lr,
+ warmup_steps=warmup_steps,
+ )
+
+ return self._lr_sched
+
+ @property
+ def dataloaders(self) -> dict:
+ """
+ A property that lazily creates the dataloaders for each split the first time they are needed.
+
+ If no train_dataset_ratio is provided, map-style datasets are concatenated and
+ wds.DataPipeline datasets are chained separately. The training set becomes a tuple
+ (ConcatDataset, ChainDataset); both are optional, but at least one is required.
+ The resulting ConcatDataset and ChainDataset are sampled evenly.
+
+ If train_dataset_ratio is provided, create a MultiIterLoader to sample
+ each dataset by ratios during training.
+
+ Currently do not support multiple datasets for validation and test.
+
+ Returns:
+ dict: {split_name: (tuples of) dataloader}
+ """
+ if self._dataloaders is None:
+
+ # concatenate map-style datasets and chain wds.DataPipe datasets separately
+ # training set becomes a tuple (ConcatDataset, ChainDataset), both are
+ # optional but at least one of them is required. The resultant ConcatDataset
+ # and ChainDataset will be sampled evenly.
+ logging.info(
+ "dataset_ratios not specified, datasets will be concatenated (map-style datasets) or chained (webdataset.DataPipeline)."
+ )
+
+ datasets = reorg_datasets_by_split(self.datasets)
+ self.datasets = datasets
+ # self.datasets = concat_datasets(datasets)
+
+ # print dataset statistics after concatenation/chaining
+ for split_name in self.datasets:
+ if isinstance(self.datasets[split_name], tuple) or isinstance(
+ self.datasets[split_name], list
+ ):
+ # mixed wds.DataPipeline and torch.utils.data.Dataset
+ num_records = sum(
+ [
+ len(d)
+ if not type(d) in [wds.DataPipeline, ChainDataset]
+ else 0
+ for d in self.datasets[split_name]
+ ]
+ )
+
+ else:
+ if hasattr(self.datasets[split_name], "__len__"):
+ # a single map-style dataset
+ num_records = len(self.datasets[split_name])
+ else:
+ # a single wds.DataPipeline
+ num_records = -1
+ logging.info(
+ "Only a single wds.DataPipeline dataset, no __len__ attribute."
+ )
+
+ if num_records >= 0:
+ logging.info(
+ "Loaded {} records for {} split from the dataset.".format(
+ num_records, split_name
+ )
+ )
+
+ # create dataloaders
+ split_names = sorted(self.datasets.keys())
+
+ datasets = [self.datasets[split] for split in split_names]
+ is_trains = [split in self.train_splits for split in split_names]
+
+ batch_sizes = [
+ self.config.run_cfg.batch_size_train
+ if split == "train"
+ else self.config.run_cfg.batch_size_eval
+ for split in split_names
+ ]
+
+ collate_fns = []
+ for dataset in datasets:
+ if isinstance(dataset, tuple) or isinstance(dataset, list):
+ collate_fns.append([getattr(d, "collater", None) for d in dataset])
+ else:
+ collate_fns.append(getattr(dataset, "collater", None))
+
+ dataloaders = self.create_loaders(
+ datasets=datasets,
+ num_workers=self.config.run_cfg.num_workers,
+ batch_sizes=batch_sizes,
+ is_trains=is_trains,
+ collate_fns=collate_fns,
+ )
+
+ self._dataloaders = {k: v for k, v in zip(split_names, dataloaders)}
+
+ return self._dataloaders
+
+ @property
+ def cuda_enabled(self):
+ return self.device.type == "cuda"
+
+ @property
+ def max_epoch(self):
+ return int(self.config.run_cfg.max_epoch)
+
+ @property
+ def log_freq(self):
+ log_freq = self.config.run_cfg.get("log_freq", 50)
+ return int(log_freq)
+
+ @property
+ def init_lr(self):
+ return float(self.config.run_cfg.init_lr)
+
+ @property
+ def min_lr(self):
+ return float(self.config.run_cfg.min_lr)
+
+ @property
+ def accum_grad_iters(self):
+ return int(self.config.run_cfg.get("accum_grad_iters", 1))
+
+ @property
+ def valid_splits(self):
+ valid_splits = self.config.run_cfg.get("valid_splits", [])
+
+ if len(valid_splits) == 0:
+ logging.info("No validation splits found.")
+
+ return valid_splits
+
+ @property
+ def test_splits(self):
+ test_splits = self.config.run_cfg.get("test_splits", [])
+
+ return test_splits
+
+ @property
+ def train_splits(self):
+ train_splits = self.config.run_cfg.get("train_splits", [])
+
+ if len(train_splits) == 0:
+ logging.info("Empty train splits.")
+
+ return train_splits
+
+ @property
+ def evaluate_only(self):
+ """
+ Set to True to skip training.
+ """
+ return self.config.run_cfg.evaluate
+
+ @property
+ def use_dist_eval_sampler(self):
+ return self.config.run_cfg.get("use_dist_eval_sampler", True)
+
+ @property
+ def resume_ckpt_path(self):
+ return self.config.run_cfg.get("resume_ckpt_path", None)
+
+ @property
+ def train_loader(self):
+ train_dataloader = self.dataloaders["train"]
+
+ return train_dataloader
+
+ def setup_output_dir(self):
+ lib_root = Path(registry.get_path("library_root"))
+
+ output_dir = lib_root / self.config.run_cfg.output_dir / self.job_id
+ result_dir = output_dir / "result"
+
+ output_dir.mkdir(parents=True, exist_ok=True)
+ result_dir.mkdir(parents=True, exist_ok=True)
+
+ registry.register_path("result_dir", str(result_dir))
+ registry.register_path("output_dir", str(output_dir))
+
+ self.result_dir = result_dir
+ self.output_dir = output_dir
+
+ def train(self):
+ start_time = time.time()
+ best_agg_metric = 0
+ best_epoch = 0
+
+ self.log_config()
+
+ # resume from checkpoint if specified
+ if not self.evaluate_only and self.resume_ckpt_path is not None:
+ self._load_checkpoint(self.resume_ckpt_path)
+
+ for cur_epoch in range(self.start_epoch, self.max_epoch):
+ # training phase
+ if not self.evaluate_only:
+ logging.info("Start training")
+ train_stats = self.train_epoch(cur_epoch)
+ self.log_stats(split_name="train", stats=train_stats)
+
+ # evaluation phase
+ if len(self.valid_splits) > 0:
+ for split_name in self.valid_splits:
+ logging.info("Evaluating on {}.".format(split_name))
+
+ val_log = self.eval_epoch(
+ split_name=split_name, cur_epoch=cur_epoch
+ )
+ if val_log is not None:
+ if is_main_process():
+ assert (
+ "agg_metrics" in val_log
+ ), "No agg_metrics found in validation log."
+
+ agg_metrics = val_log["agg_metrics"]
+ if agg_metrics > best_agg_metric and split_name == "val":
+ best_epoch, best_agg_metric = cur_epoch, agg_metrics
+
+ self._save_checkpoint(cur_epoch, is_best=True)
+
+ val_log.update({"best_epoch": best_epoch})
+ self.log_stats(val_log, split_name)
+
+ else:
+ # if no validation split is provided, we just save the checkpoint at the end of each epoch.
+ if not self.evaluate_only:
+ self._save_checkpoint(cur_epoch, is_best=False)
+
+ if self.evaluate_only:
+ break
+
+ if self.config.run_cfg.distributed:
+ dist.barrier()
+
+ # testing phase
+ test_epoch = "best" if len(self.valid_splits) > 0 else cur_epoch
+ self.evaluate(cur_epoch=test_epoch, skip_reload=self.evaluate_only)
+
+ total_time = time.time() - start_time
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+ logging.info("Training time {}".format(total_time_str))
+
+ def evaluate(self, cur_epoch="best", skip_reload=False):
+ test_logs = dict()
+
+ if len(self.test_splits) > 0:
+ for split_name in self.test_splits:
+ test_logs[split_name] = self.eval_epoch(
+ split_name=split_name, cur_epoch=cur_epoch, skip_reload=skip_reload
+ )
+
+ return test_logs
+
+ def train_epoch(self, epoch):
+ # train
+ self.model.train()
+
+ return self.task.train_epoch(
+ epoch=epoch,
+ model=self.model,
+ data_loader=self.train_loader,
+ optimizer=self.optimizer,
+ scaler=self.scaler,
+ lr_scheduler=self.lr_scheduler,
+ cuda_enabled=self.cuda_enabled,
+ log_freq=self.log_freq,
+ accum_grad_iters=self.accum_grad_iters,
+ )
+
+ @torch.no_grad()
+ def eval_epoch(self, split_name, cur_epoch, skip_reload=False):
+ """
+ Evaluate the model on a given split.
+
+ Args:
+ split_name (str): name of the split to evaluate on.
+ cur_epoch (int): current epoch.
+            skip_reload (bool): whether to skip reloading the best checkpoint.
+                During training, we reload the best checkpoint for validation.
+                During testing, we use the provided weights and skip reloading the best checkpoint.
+ """
+ data_loader = self.dataloaders.get(split_name, None)
+ assert data_loader, "data_loader for split {} is None.".format(split_name)
+
+ # TODO In validation, you need to compute loss as well as metrics
+ # TODO consider moving to model.before_evaluation()
+ model = self.unwrap_dist_model(self.model)
+ if not skip_reload and cur_epoch == "best":
+ model = self._reload_best_model(model)
+ model.eval()
+
+ self.task.before_evaluation(
+ model=model,
+ dataset=self.datasets[split_name],
+ )
+ results = self.task.evaluation(model, data_loader)
+
+ if results is not None:
+ return self.task.after_evaluation(
+ val_result=results,
+ split_name=split_name,
+ epoch=cur_epoch,
+ )
+
+ def unwrap_dist_model(self, model):
+ if self.use_distributed:
+ return model.module
+ else:
+ return model
+
+ def create_loaders(
+ self,
+ datasets,
+ num_workers,
+ batch_sizes,
+ is_trains,
+ collate_fns,
+ dataset_ratios=None,
+ ):
+ """
+ Create dataloaders for training and validation.
+ """
+
+ def _create_loader(dataset, num_workers, bsz, is_train, collate_fn):
+ # create a single dataloader for each split
+ if isinstance(dataset, ChainDataset) or isinstance(
+ dataset, wds.DataPipeline
+ ):
+                # wds.WebDataset instances are chained together
+ # webdataset.DataPipeline has its own sampler and collate_fn
+ loader = iter(
+ DataLoader(
+ dataset,
+ batch_size=bsz,
+ num_workers=num_workers,
+ pin_memory=True,
+ )
+ )
+ else:
+                # map-style datasets are concatenated together
+ # setup distributed sampler
+ if self.use_distributed:
+ sampler = DistributedSampler(
+ dataset,
+ shuffle=is_train,
+ num_replicas=get_world_size(),
+ rank=get_rank(),
+ )
+ if not self.use_dist_eval_sampler:
+ # e.g. retrieval evaluation
+ sampler = sampler if is_train else None
+ else:
+ sampler = None
+
+ loader = DataLoader(
+ dataset,
+ batch_size=bsz,
+ num_workers=num_workers,
+ pin_memory=True,
+ sampler=sampler,
+ shuffle=sampler is None and is_train,
+ collate_fn=collate_fn,
+ drop_last=True if is_train else False,
+ )
+ loader = PrefetchLoader(loader)
+
+ if is_train:
+ loader = IterLoader(loader, use_distributed=self.use_distributed)
+
+ return loader
+
+ loaders = []
+
+ for dataset, bsz, is_train, collate_fn in zip(
+ datasets, batch_sizes, is_trains, collate_fns
+ ):
+ if isinstance(dataset, list) or isinstance(dataset, tuple):
+ if hasattr(dataset[0], 'sample_ratio') and dataset_ratios is None:
+ dataset_ratios = [d.sample_ratio for d in dataset]
+ loader = MultiIterLoader(
+ loaders=[
+ _create_loader(d, num_workers, bsz, is_train, collate_fn[i])
+ for i, d in enumerate(dataset)
+ ],
+ ratios=dataset_ratios,
+ )
+ else:
+ loader = _create_loader(dataset, num_workers, bsz, is_train, collate_fn)
+
+ loaders.append(loader)
+
+ return loaders
+
+ @main_process
+ def _save_checkpoint(self, cur_epoch, is_best=False):
+ """
+ Save the checkpoint at the current epoch.
+ """
+ model_no_ddp = self.unwrap_dist_model(self.model)
+ param_grad_dic = {
+ k: v.requires_grad for (k, v) in model_no_ddp.named_parameters()
+ }
+ state_dict = model_no_ddp.state_dict()
+ for k in list(state_dict.keys()):
+ if k in param_grad_dic.keys() and not param_grad_dic[k]:
+ # delete parameters that do not require gradient
+ del state_dict[k]
+ save_obj = {
+ "model": state_dict,
+ "optimizer": self.optimizer.state_dict(),
+ "config": self.config.to_dict(),
+ "scaler": self.scaler.state_dict() if self.scaler else None,
+ "epoch": cur_epoch,
+ }
+ save_to = os.path.join(
+ self.output_dir,
+ "checkpoint_{}.pth".format("best" if is_best else cur_epoch),
+ )
+ logging.info("Saving checkpoint at epoch {} to {}.".format(cur_epoch, save_to))
+ torch.save(save_obj, save_to)
+
+ def _reload_best_model(self, model):
+ """
+ Load the best checkpoint for evaluation.
+ """
+ checkpoint_path = os.path.join(self.output_dir, "checkpoint_best.pth")
+
+ logging.info("Loading checkpoint from {}.".format(checkpoint_path))
+ checkpoint = torch.load(checkpoint_path, map_location="cpu")
+ try:
+ model.load_state_dict(checkpoint["model"])
+ except RuntimeError as e:
+ logging.warning(
+ """
+ Key mismatch when loading checkpoint. This is expected if only part of the model is saved.
+ Trying to load the model with strict=False.
+ """
+ )
+ model.load_state_dict(checkpoint["model"], strict=False)
+ return model
+
+ def _load_checkpoint(self, url_or_filename):
+ """
+ Resume from a checkpoint.
+ """
+ if is_url(url_or_filename):
+ cached_file = download_cached_file(
+ url_or_filename, check_hash=False, progress=True
+ )
+ checkpoint = torch.load(cached_file, map_location=self.device)
+ elif os.path.isfile(url_or_filename):
+ checkpoint = torch.load(url_or_filename, map_location=self.device)
+ else:
+ raise RuntimeError("checkpoint url or path is invalid")
+
+ state_dict = checkpoint["model"]
+        self.unwrap_dist_model(self.model).load_state_dict(state_dict, strict=False)
+
+ self.optimizer.load_state_dict(checkpoint["optimizer"])
+ if self.scaler and "scaler" in checkpoint:
+ self.scaler.load_state_dict(checkpoint["scaler"])
+
+ self.start_epoch = checkpoint["epoch"] + 1
+ logging.info("Resume checkpoint from {}".format(url_or_filename))
+
+ @main_process
+ def log_stats(self, stats, split_name):
+ if isinstance(stats, dict):
+ log_stats = {**{f"{split_name}_{k}": v for k, v in stats.items()}}
+ with open(os.path.join(self.output_dir, "log.txt"), "a") as f:
+ f.write(json.dumps(log_stats) + "\n")
+ elif isinstance(stats, list):
+ pass
+
+ @main_process
+ def log_config(self):
+ with open(os.path.join(self.output_dir, "log.txt"), "a") as f:
+ f.write(json.dumps(self.config.to_dict(), indent=4) + "\n")
diff --git a/models/MiniGPT4/minigpt4/tasks/__init__.py b/models/MiniGPT4/minigpt4/tasks/__init__.py
new file mode 100644
index 0000000..ab1fb1c
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/tasks/__init__.py
@@ -0,0 +1,26 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+from minigpt4.common.registry import registry
+from minigpt4.tasks.base_task import BaseTask
+from minigpt4.tasks.image_text_pretrain import ImageTextPretrainTask
+
+
+def setup_task(cfg):
+ assert "task" in cfg.run_cfg, "Task name must be provided."
+
+ task_name = cfg.run_cfg.task
+ task = registry.get_task_class(task_name).setup_task(cfg=cfg)
+ assert task is not None, "Task {} not properly registered.".format(task_name)
+
+ return task
+
+
+__all__ = [
+ "BaseTask",
+ "ImageTextPretrainTask",
+]
diff --git a/models/MiniGPT4/minigpt4/tasks/__pycache__/__init__.cpython-310.pyc b/models/MiniGPT4/minigpt4/tasks/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..437f7ce
Binary files /dev/null and b/models/MiniGPT4/minigpt4/tasks/__pycache__/__init__.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/tasks/__pycache__/base_task.cpython-310.pyc b/models/MiniGPT4/minigpt4/tasks/__pycache__/base_task.cpython-310.pyc
new file mode 100644
index 0000000..a4f7f03
Binary files /dev/null and b/models/MiniGPT4/minigpt4/tasks/__pycache__/base_task.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/tasks/__pycache__/image_text_pretrain.cpython-310.pyc b/models/MiniGPT4/minigpt4/tasks/__pycache__/image_text_pretrain.cpython-310.pyc
new file mode 100644
index 0000000..065a979
Binary files /dev/null and b/models/MiniGPT4/minigpt4/tasks/__pycache__/image_text_pretrain.cpython-310.pyc differ
diff --git a/models/MiniGPT4/minigpt4/tasks/base_task.py b/models/MiniGPT4/minigpt4/tasks/base_task.py
new file mode 100644
index 0000000..7ceee96
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/tasks/base_task.py
@@ -0,0 +1,286 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import logging
+import os
+
+import torch
+import torch.distributed as dist
+from minigpt4.common.dist_utils import get_rank, get_world_size, is_main_process, is_dist_avail_and_initialized
+from minigpt4.common.logger import MetricLogger, SmoothedValue
+from minigpt4.common.registry import registry
+from minigpt4.datasets.data_utils import prepare_sample
+
+
+class BaseTask:
+ def __init__(self, **kwargs):
+ super().__init__()
+
+ self.inst_id_key = "instance_id"
+
+ @classmethod
+ def setup_task(cls, **kwargs):
+ return cls()
+
+ def build_model(self, cfg):
+ model_config = cfg.model_cfg
+
+ model_cls = registry.get_model_class(model_config.arch)
+ return model_cls.from_config(model_config)
+
+ def build_datasets(self, cfg):
+ """
+ Build a dictionary of datasets, keyed by split 'train', 'valid', 'test'.
+        Datasets and annotations are downloaded automatically if they do not exist.
+
+        Args:
+            cfg (common.config.Config): configuration object whose datasets_cfg describes the datasets to build.
+
+ Returns:
+ dict: Dictionary of torch.utils.data.Dataset objects by split.
+ """
+
+ datasets = dict()
+
+ datasets_config = cfg.datasets_cfg
+
+ assert len(datasets_config) > 0, "At least one dataset has to be specified."
+
+ for name in datasets_config:
+ dataset_config = datasets_config[name]
+
+ builder = registry.get_builder_class(name)(dataset_config)
+ dataset = builder.build_datasets()
+
+ dataset['train'].name = name
+ if 'sample_ratio' in dataset_config:
+ dataset['train'].sample_ratio = dataset_config.sample_ratio
+
+ datasets[name] = dataset
+
+ return datasets
+
+ def train_step(self, model, samples):
+ loss = model(samples)["loss"]
+ return loss
+
+ def valid_step(self, model, samples):
+ raise NotImplementedError
+
+ def before_evaluation(self, model, dataset, **kwargs):
+ model.before_evaluation(dataset=dataset, task_type=type(self))
+
+ def after_evaluation(self, **kwargs):
+ pass
+
+ def inference_step(self):
+ raise NotImplementedError
+
+ def evaluation(self, model, data_loader, cuda_enabled=True):
+ metric_logger = MetricLogger(delimiter=" ")
+ header = "Evaluation"
+ # TODO make it configurable
+ print_freq = 10
+
+ results = []
+
+ for samples in metric_logger.log_every(data_loader, print_freq, header):
+ samples = prepare_sample(samples, cuda_enabled=cuda_enabled)
+
+ eval_output = self.valid_step(model=model, samples=samples)
+ results.extend(eval_output)
+
+ if is_dist_avail_and_initialized():
+ dist.barrier()
+
+ return results
+
+ def train_epoch(
+ self,
+ epoch,
+ model,
+ data_loader,
+ optimizer,
+ lr_scheduler,
+ scaler=None,
+ cuda_enabled=False,
+ log_freq=50,
+ accum_grad_iters=1,
+ ):
+ return self._train_inner_loop(
+ epoch=epoch,
+ iters_per_epoch=lr_scheduler.iters_per_epoch,
+ model=model,
+ data_loader=data_loader,
+ optimizer=optimizer,
+ scaler=scaler,
+ lr_scheduler=lr_scheduler,
+ log_freq=log_freq,
+ cuda_enabled=cuda_enabled,
+ accum_grad_iters=accum_grad_iters,
+ )
+
+ def train_iters(
+ self,
+ epoch,
+ start_iters,
+ iters_per_inner_epoch,
+ model,
+ data_loader,
+ optimizer,
+ lr_scheduler,
+ scaler=None,
+ cuda_enabled=False,
+ log_freq=50,
+ accum_grad_iters=1,
+ ):
+ return self._train_inner_loop(
+ epoch=epoch,
+ start_iters=start_iters,
+ iters_per_epoch=iters_per_inner_epoch,
+ model=model,
+ data_loader=data_loader,
+ optimizer=optimizer,
+ scaler=scaler,
+ lr_scheduler=lr_scheduler,
+ log_freq=log_freq,
+ cuda_enabled=cuda_enabled,
+ accum_grad_iters=accum_grad_iters,
+ )
+
+ def _train_inner_loop(
+ self,
+ epoch,
+ iters_per_epoch,
+ model,
+ data_loader,
+ optimizer,
+ lr_scheduler,
+ scaler=None,
+ start_iters=None,
+ log_freq=50,
+ cuda_enabled=False,
+ accum_grad_iters=1,
+ ):
+ """
+ An inner training loop compatible with both epoch-based and iter-based training.
+
+ When using epoch-based, training stops after one epoch; when using iter-based,
+ training stops after #iters_per_epoch iterations.
+ """
+ use_amp = scaler is not None
+
+ if not hasattr(data_loader, "__next__"):
+ # convert to iterator if not already
+ data_loader = iter(data_loader)
+
+ metric_logger = MetricLogger(delimiter=" ")
+ metric_logger.add_meter("lr", SmoothedValue(window_size=1, fmt="{value:.6f}"))
+ metric_logger.add_meter("loss", SmoothedValue(window_size=1, fmt="{value:.4f}"))
+
+ # if iter-based runner, schedule lr based on inner epoch.
+ logging.info(
+ "Start training epoch {}, {} iters per inner epoch.".format(
+ epoch, iters_per_epoch
+ )
+ )
+ header = "Train: data epoch: [{}]".format(epoch)
+ if start_iters is None:
+ # epoch-based runner
+ inner_epoch = epoch
+ else:
+ # In iter-based runner, we schedule the learning rate based on iterations.
+ inner_epoch = start_iters // iters_per_epoch
+ header = header + "; inner epoch [{}]".format(inner_epoch)
+
+ for i in metric_logger.log_every(range(iters_per_epoch), log_freq, header):
+ # if using iter-based runner, we stop after iters_per_epoch iterations.
+ if i >= iters_per_epoch:
+ break
+
+ samples = next(data_loader)
+
+ samples = prepare_sample(samples, cuda_enabled=cuda_enabled)
+ samples.update(
+ {
+ "epoch": inner_epoch,
+ "num_iters_per_epoch": iters_per_epoch,
+ "iters": i,
+ }
+ )
+
+ lr_scheduler.step(cur_epoch=inner_epoch, cur_step=i)
+
+ with torch.cuda.amp.autocast(enabled=use_amp):
+ loss = self.train_step(model=model, samples=samples)
+
+ # after_train_step()
+ if use_amp:
+ scaler.scale(loss).backward()
+ else:
+ loss.backward()
+
+ # update gradients every accum_grad_iters iterations
+ if (i + 1) % accum_grad_iters == 0:
+ if use_amp:
+ scaler.step(optimizer)
+ scaler.update()
+ else:
+ optimizer.step()
+ optimizer.zero_grad()
+
+ metric_logger.update(loss=loss.item())
+ metric_logger.update(lr=optimizer.param_groups[0]["lr"])
+
+ # after train_epoch()
+ # gather the stats from all processes
+ metric_logger.synchronize_between_processes()
+ logging.info("Averaged stats: " + str(metric_logger.global_avg()))
+ return {
+ k: "{:.3f}".format(meter.global_avg)
+ for k, meter in metric_logger.meters.items()
+ }
+
+ @staticmethod
+ def save_result(result, result_dir, filename, remove_duplicate=""):
+ import json
+
+ result_file = os.path.join(
+ result_dir, "%s_rank%d.json" % (filename, get_rank())
+ )
+ final_result_file = os.path.join(result_dir, "%s.json" % filename)
+
+ json.dump(result, open(result_file, "w"))
+
+ if is_dist_avail_and_initialized():
+ dist.barrier()
+
+ if is_main_process():
+ logging.warning("rank %d starts merging results." % get_rank())
+ # combine results from all processes
+ result = []
+
+ for rank in range(get_world_size()):
+ result_file = os.path.join(
+ result_dir, "%s_rank%d.json" % (filename, rank)
+ )
+ res = json.load(open(result_file, "r"))
+ result += res
+
+ if remove_duplicate:
+ result_new = []
+ id_list = []
+ for res in result:
+ if res[remove_duplicate] not in id_list:
+ id_list.append(res[remove_duplicate])
+ result_new.append(res)
+ result = result_new
+
+ json.dump(result, open(final_result_file, "w"))
+ print("result file saved to %s" % final_result_file)
+
+ return final_result_file
diff --git a/models/MiniGPT4/minigpt4/tasks/image_text_pretrain.py b/models/MiniGPT4/minigpt4/tasks/image_text_pretrain.py
new file mode 100644
index 0000000..bbe8ec8
--- /dev/null
+++ b/models/MiniGPT4/minigpt4/tasks/image_text_pretrain.py
@@ -0,0 +1,18 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+from minigpt4.common.registry import registry
+from minigpt4.tasks.base_task import BaseTask
+
+
+@registry.register_task("image_text_pretrain")
+class ImageTextPretrainTask(BaseTask):
+ def __init__(self):
+ super().__init__()
+
+ def evaluation(self, model, data_loader, cuda_enabled=True):
+ pass
diff --git a/models/MiniGPT4/prompts/alignment.txt b/models/MiniGPT4/prompts/alignment.txt
new file mode 100644
index 0000000..38ae75a
--- /dev/null
+++ b/models/MiniGPT4/prompts/alignment.txt
@@ -0,0 +1,4 @@
+ Describe this image in detail.
+ Take a look at this image and describe what you notice.
+ Please provide a detailed description of the picture.
+ Could you describe the contents of this image for me?
\ No newline at end of file
diff --git a/models/MiniGPT4/train.py b/models/MiniGPT4/train.py
new file mode 100644
index 0000000..a90cb3f
--- /dev/null
+++ b/models/MiniGPT4/train.py
@@ -0,0 +1,103 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+import argparse
+import os
+import random
+
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+
+import minigpt4.tasks as tasks
+from minigpt4.common.config import Config
+from minigpt4.common.dist_utils import get_rank, init_distributed_mode
+from minigpt4.common.logger import setup_logger
+from minigpt4.common.optims import (
+ LinearWarmupCosineLRScheduler,
+ LinearWarmupStepLRScheduler,
+)
+from minigpt4.common.registry import registry
+from minigpt4.common.utils import now
+
+# imports modules for registration
+from minigpt4.datasets.builders import *
+from minigpt4.models import *
+from minigpt4.processors import *
+from minigpt4.runners import *
+from minigpt4.tasks import *
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description="Training")
+
+ parser.add_argument("--cfg-path", required=True, help="path to configuration file.")
+ parser.add_argument(
+ "--options",
+ nargs="+",
+ help="override some settings in the used config, the key-value pair "
+ "in xxx=yyy format will be merged into config file (deprecate), "
+ "change to --cfg-options instead.",
+ )
+
+ args = parser.parse_args()
+ # if 'LOCAL_RANK' not in os.environ:
+ # os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+ return args
+
+
+def setup_seeds(config):
+ seed = config.run_cfg.seed + get_rank()
+
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+
+ cudnn.benchmark = False
+ cudnn.deterministic = True
+
+
+def get_runner_class(cfg):
+ """
+ Get runner class from config. Default to epoch-based runner.
+ """
+ runner_cls = registry.get_runner_class(cfg.run_cfg.get("runner", "runner_base"))
+
+ return runner_cls
+
+
+def main():
+    # allow auto-download to complete on the main process without timeout when using the NCCL backend.
+ # os.environ["NCCL_BLOCKING_WAIT"] = "1"
+
+ # set before init_distributed_mode() to ensure the same job_id shared across all ranks.
+ job_id = now()
+
+ cfg = Config(parse_args())
+
+ init_distributed_mode(cfg.run_cfg)
+
+ setup_seeds(cfg)
+
+ # set after init_distributed_mode() to only log on master.
+ setup_logger()
+
+ cfg.pretty_print()
+
+ task = tasks.setup_task(cfg)
+ datasets = task.build_datasets(cfg)
+ model = task.build_model(cfg)
+
+ runner = get_runner_class(cfg)(
+ cfg=cfg, job_id=job_id, task=task, model=model, datasets=datasets
+ )
+ runner.train()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/models/MiniGPT4/train_configs/minigpt4_stage1_pretrain.yaml b/models/MiniGPT4/train_configs/minigpt4_stage1_pretrain.yaml
new file mode 100644
index 0000000..044246c
--- /dev/null
+++ b/models/MiniGPT4/train_configs/minigpt4_stage1_pretrain.yaml
@@ -0,0 +1,57 @@
+model:
+ arch: mini_gpt4
+ model_type: pretrain_vicuna
+ freeze_vit: True
+ freeze_qformer: True
+
+
+datasets:
+ laion:
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ sample_ratio: 115
+ cc_sbu:
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+ sample_ratio: 14
+
+
+run:
+ task: image_text_pretrain
+ # optimizer
+ lr_sched: "linear_warmup_cosine_lr"
+ init_lr: 1e-4
+ min_lr: 8e-5
+ warmup_lr: 1e-6
+
+ weight_decay: 0.05
+ max_epoch: 4
+ batch_size_train: 64
+ batch_size_eval: 64
+ num_workers: 4
+ warmup_steps: 5000
+ iters_per_epoch: 5000
+
+ seed: 42
+ output_dir: "output/minigpt4_stage1_pretrain"
+
+ amp: True
+ resume_ckpt_path: null
+
+ evaluate: False
+ train_splits: ["train"]
+
+ device: "cuda"
+ world_size: 1
+ dist_url: "env://"
+ distributed: True
\ No newline at end of file
diff --git a/models/MiniGPT4/train_configs/minigpt4_stage2_finetune.yaml b/models/MiniGPT4/train_configs/minigpt4_stage2_finetune.yaml
new file mode 100644
index 0000000..1013bea
--- /dev/null
+++ b/models/MiniGPT4/train_configs/minigpt4_stage2_finetune.yaml
@@ -0,0 +1,51 @@
+model:
+ arch: mini_gpt4
+ model_type: pretrain_vicuna
+ freeze_vit: True
+ freeze_qformer: True
+ max_txt_len: 160
+ end_sym: "###"
+ prompt_path: "prompts/alignment.txt"
+ prompt_template: '###Human: {} ###Assistant: '
+ ckpt: '/path/to/stage1/checkpoint/'
+
+
+datasets:
+ cc_sbu_align:
+ vis_processor:
+ train:
+ name: "blip2_image_train"
+ image_size: 224
+ text_processor:
+ train:
+ name: "blip_caption"
+
+run:
+ task: image_text_pretrain
+ # optimizer
+ lr_sched: "linear_warmup_cosine_lr"
+ init_lr: 3e-5
+ min_lr: 1e-5
+ warmup_lr: 1e-6
+
+ weight_decay: 0.05
+ max_epoch: 5
+ iters_per_epoch: 200
+ batch_size_train: 12
+ batch_size_eval: 12
+ num_workers: 4
+ warmup_steps: 200
+
+ seed: 42
+ output_dir: "output/minigpt4_stage2_finetune"
+
+ amp: True
+ resume_ckpt_path: null
+
+ evaluate: False
+ train_splits: ["train"]
+
+ device: "cuda"
+ world_size: 1
+ dist_url: "env://"
+ distributed: True
\ No newline at end of file
diff --git a/models/mPLUG_Owl b/models/mPLUG_Owl
deleted file mode 160000
index 6f695ac..0000000
--- a/models/mPLUG_Owl
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 6f695ac5fc6cd6d6c99763f298c2acb421e28564
diff --git a/models/mPLUG_Owl/LICENSE b/models/mPLUG_Owl/LICENSE
new file mode 100644
index 0000000..ccbb3c8
--- /dev/null
+++ b/models/mPLUG_Owl/LICENSE
@@ -0,0 +1,203 @@
+Copyright 2023 Alibaba X-PLUG Team. All rights reserved.
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2023 Alibaba X-PLUG Team.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
\ No newline at end of file
diff --git a/models/mPLUG_Owl/OwlEval/OwlEval.md b/models/mPLUG_Owl/OwlEval/OwlEval.md
new file mode 100644
index 0000000..ca44851
--- /dev/null
+++ b/models/mPLUG_Owl/OwlEval/OwlEval.md
@@ -0,0 +1,60 @@
+# OwlEval
+
+We have compiled some examples and their corresponding questions from recent open-source work, and organized them into OwlEval.
+
+In the following, we introduce OwlEval and its data format.
+
+## Data Format
+
+### questions
+
+`questions.jsonl` contains the case images and their corresponding questions.
+
+Each row contains the following fields:
+
+- `image`: the file name of the image
+- `question_id`: the question ID (there are 82 questions in total)
+- `question`: the question text
+- `type`: whether the question is single-turn or multi-turn
+
+For example:
+
+```json
+{"image": "1.jpg", "question_id": 1, "question": "What is funny about this image? Describe it panel by panel.", "type": ["single"]}
+```
+
+### answer
+
+This folder contains each model's responses to every question, organized into six JSONL files:
+
+`llava_13b_answer.jsonl`
+
+`minigpt4_13b_answer.jsonl`
+
+`MMreact_answer.jsonl`
+
+`mPLUG_Owl_7b_answer.jsonl`
+
+`BLIP2_13b_answer.jsonl`
+
+`openflamingo_answer.jsonl`
+
+Each `answer/xxx.jsonl` file contains the following fields:
+
+- `image`: the file name of the image
+- `question_id`: the question ID (there are 82 questions in total)
+- `question`: the question text
+- `answer`: the reply given by the model
+- `model_id`: the ID of the model that generated the answer
+
+For example:
+
+```json
+{"image": "10.jpg", "question_id": 15, "question": "How many bedrooms are there in this floor plan?", "answer": "There are three bedrooms in this floor plan.", "model_id": "llava-13b"}
+```
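+
+As a quick sanity check, the sketch below shows one way to load `questions.jsonl` together with one of the answer files listed above and pair records by `question_id`. It is only an illustration, not part of OwlEval; the `load_jsonl` and `answers_by_qid` helpers are our own names.
+
+```python
+import json
+
+def load_jsonl(path):
+    """Read one JSON object per non-empty line."""
+    with open(path, "r", encoding="utf-8") as f:
+        return [json.loads(line) for line in f if line.strip()]
+
+questions = load_jsonl("questions.jsonl")
+answers = load_jsonl("answer/mPLUG_Owl_7b_answer.jsonl")
+
+# hypothetical helper: index answers by question_id for easy lookup
+answers_by_qid = {a["question_id"]: a for a in answers}
+
+for q in questions:
+    a = answers_by_qid.get(q["question_id"])
+    if a is not None:
+        print(q["image"], q["question"], "->", a["answer"][:80])
+```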
+
+### cases
+
+This folder contains 50 evaluation images: 21 from MiniGPT-4, 13 from MM-REACT, 9 from BLIP-2, 3 from GPT-4, and 4 collected by us.
\ No newline at end of file
diff --git a/models/mPLUG_Owl/OwlEval/answer/MMreact_answer.jsonl b/models/mPLUG_Owl/OwlEval/answer/MMreact_answer.jsonl
new file mode 100644
index 0000000..e14638b
--- /dev/null
+++ b/models/mPLUG_Owl/OwlEval/answer/MMreact_answer.jsonl
@@ -0,0 +1,82 @@
+{"image": "1.png", "question_id": 1, "question": "What is funny about this image? Describe it panel by panel.", "answer": "无法回溯", "model_id": "MMreact"}
+{"image": "2.png", "question_id": 2, "question": "Why would a person find this image funny?", "answer": "The humor in this image is subjective and \ncan vary from person to person. Some \npeople might find the cartoon squirrel \nholding a camera funny because it is a cute \nand quirky illustration. Others might find it \nfunny because it is a humorous take on the \nidea of a squirrel taking photos. \nUltimately, the humor in this image is up \nto the interpretation of the individual \nviewer. So, it depends on the person’s \nsense of humor and personal preferences.", "model_id": "MMreact"}
+{"image": "3.png", "question_id": 3, "question": "What is unusual about this image?", "answer": "The unusual thing about this image is that\na man is ironing clothes in the back of a\nyellow taxi. This is not a common sight and\nmay be considered unusual or unique.", "model_id": "MMreact"}
+{"image": "4.png", "question_id": 4, "question": "Is there text in the image? If so, can you read it and explain it to me? ", "answer": "The humor in the comic may come from the situation, the characters, or the dialogue. The person is feeling unwell and is asked if they are pregnant or have the coronavirus, which is a current and relevant topic. The humor may come from the absurdity of the situation or the unexpectedness of the question.\n", "model_id": "MMreact"}
+{"image": "4.png", "question_id": 5, "question": "What is funny about it?", "answer": "The humor in the comic may come from\nthe situation, the characters, or the\ndialogue. The person is feeling unwell and\nis asked if they are pregnant or have the\ncoronavirus, which is a current and\nrelevant topic. The humor may come\nfrom the absurdity of the situation or the\nunexpectedness of the question.\nHowever, humor is subjective and what\none person finds funny, another person\nmay not. It is best to view the comic and\ndecide for yourself if it is funny.", "model_id": "MMreact"}
+{"image": "5.png", "question_id": 6, "question": "Where is the frisbee in the image? ", "answer": "The frisbee is located at coordinates\n(63,119,133,177).\n", "model_id": "MMreact"}
+{"image": "5.png", "question_id": 7, "question": "Where is the man in the image?", "answer": "The man is located at the coordinates\n(66, 386) to (453, 604) in the image.\n", "model_id": "MMreact"}
+{"image": "5.png", "question_id": 8, "question": "What is the spatial relation between the frisbee and the man?", "answer": "The frisbee is located above the man in the image. The man is falling on the\nground while the frisbee is in the air. The\nfrisbee is also closer to the top of the\nimage than the man..", "model_id": "MMreact"}
+{"image": "6.png", "question_id": 9, "question": "What should I do after kneading the dough, before dividing it into pieces?", "answer": "After kneading the dough, you should\nput it in a warm place before dividing it\ninto pieces, according to the recipe in\nthe image.\n", "model_id": "MMreact"}
+{"image": "6.png", "question_id": 10, "question": "How long should I put in warm places?", "answer": "You should put the dough in a warm\nplace for 1 hour.", "model_id": "MMreact"}
+{"image": "7.png", "question_id": 11, "question": "What is the total amount?", "answer": "You spent $1299.90 on airfare and $43.83 on an Uber ride for a total of $1343.73 on travel.", "model_id": "MMreact"}
+{"image": "8.png", "question_id": 12, "question": "Excluding the regular cab, what is the average total fueling cost?", "answer": "The average total fueling cost excluding the Ford F150 Regular Cab is $76.55. This is calculated by taking the sum of the total fueling cost of Toyota RAV4 and Honda Civic and dividing it by 2. $82.56 + $70.55 / 2 = $76.55.", "model_id": "MMreact"}
+{"image": "9.png", "question_id": 13, "question": "Excluding Ford F150 Lightning, what is the average total charging cost? ", "answer": "This image is a chart of the total charging cost of different electric vehicles. The electric vehicles are Ford F150 Lightning Estimate,Rivian R1T/S,and Tesla Model 3 (Standard Range).\n", "model_id": "MMreact"}
+{"image": "9.png", "question_id": 14, "question": "Now how much cheaper is the average charging cost over the electric cars cluding Ford F150 Lightning) than the average fueling cost over the gasoline cars cluding Ford F150 gular Cab))?Let's think step-by-step.", "answer": "The total charging cost of Ford F150 Lightning(Estimate)is $22.76,the total charging cost of Rivian R1T/S is $31.35, and the total charging cost of Tesla Model 3 (Standard Range) is $12.54.\n", "model_id": "MMreact"}
+{"image": "10.png", "question_id": 15, "question": "How many bedrooms are there in this floor plan?", "answer": "There are two bedrooms in this floor plan.The first one is the second bedroom and the second one is the master bedroom.\n", "model_id": "MMreact"}
+{"image": "10.png", "question_id": 16, "question": "What is the dimension of Master Bedroom?", "answer": "The dimension of the Master Bedroom is 12'10\"x17'9\".\n", "model_id": "MMreact"}
+{"image": "10.png", "question_id": 17, "question": "Is there a bathroom in master bedroom?", "answer": "Yes, there is a bathroom in the master bedroom. The dimension of the master bathroom is 4'11\"x 6'10\".\n", "model_id": "MMreact"}
+{"image": "10.png", "question_id": 18, "question": "What is the dimension of Second Bedroom?", "answer": "The dimension of the Second Bedroom is 10'10\"×14'7\".\n", "model_id": "MMreact"}
+{"image": "10.png", "question_id": 19, "question": "Is the terrace bigger than the kitchen?", "answer": "The terrace is 21'6\"× 8'0\" and the kitchen is 7'5\"x 12'3\". Based on these dimensions, the terrace is larger than the kitchen.\n", "model_id": "MMreact"}
+{"image": "10.png", "question_id": 20, "question": "What are the appliances in the kitchen in this floorplan?", "answer": "The appliances in the kitchen in this floor plan are oven and DW(dishwasher).The size of the oven is not specified in the floor plan.\n", "model_id": "MMreact"}
+{"image": "10.png", "question_id": 21, "question": "What will you see when you enter through the entrance? and what is the dimension of it?", "answer": "When you enter through the entrance, you will see the foyer. The dimension of the foyer is 6'0\"x 6'34\".\n", "model_id": "MMreact"}
+{"image": "10.png", "question_id": 22, "question": "Can you summarize this floorplan based on the aspects I was interested in, in a few sentences?", "answer": "Sure, this floor plan is of a house with two bedrooms, a master bedroom with a bathroom and a second bedroom. The master bedroom is 12'10\"x 17'9\" and the second bedroom is 10'10\" x 14'7\". The kitchen is 7'5\" x 12'3\" and the terrace is 21'6\"x8'0\".The foyer is 6'0\"x6'34\" and the entrance leads to it. The kitchen has an oven and a dishwasher.", "model_id": "MMreact"}
+{"image": "11.png", "question_id": 23, "question": "Can you read the text in this flowchart and explain the process in detail?", "answer": "AI: Certainly! Based on the OCR text provided earlier, the flowchart represents the following process:\n1. Start the process.\n2. Input the first number (num1).\n3. Input the second number (num2).\n4. Compare num1 and num2.\n5. If num1 is greater than num2 (True condition):\na. Display num1.\n6. If num1 is not greater than num2 (False condition):\na. Display num2.\n7. Stop the process.\nIn summary, this flowchart describes a simple process that takes two numbers as inputs, compares them, and displays the larger number.\n", "model_id": "MMreact"}
+{"image": "11.png", "question_id": 24, "question": "Which step in the flowchart is drawn with a diamond shape?", "answer": "The step in the flowchart drawn with a diamond shape is the decision step where the comparison between num1 and num2 is made. This step is represented by the condition “num1 > num2”. The diamond shape is commonly used in flowcharts to represent decision points or conditional statements.", "model_id": "MMreact"}
+{"image": "12.png", "question_id": 25, "question": "Can you read the text in this flowchart and explain it to me?", "answer": " This flowchart describes a process for making tea. The process starts with\nturning on the kettle and waiting for it to\nboil. Then, the water is poured over a\nteabag in a cup and the tea is allowed to\nsteep for 2-3 minutes. Finally, milk and\nsugar are added to the tea. The process\nends with the stop.\n", "model_id": "MMreact"}
+{"image": "12.png", "question_id": 26, "question": "Can you transform this flowchart into a simple program in python?", "answer": " Can't recognize the image", "model_id": "MMreact"}
+{"image": "13.png", "question_id": 27, "question": "How much is Wilkins earning?", "answer": "Wilkins is earning $2,500.\n\n", "model_id": "MMreact"}
+{"image": "13.png", "question_id": 28, "question": "Which department is the person who has the highest salary from?", "answer": "The person who has the highest salary is from the Administration department. They are earning $3,000.\n\n", "model_id": "MMreact"}
+{"image": "13.png", "question_id": 29, "question": "This table is reporting monthly salary,can you calculate the yearly salary for each employee, and summarize it into a table and group them by department?", "answer": "Sure, I can calculate the yearly salary for each employee and group them by department.\n\nThe total yearly salary for Administration is $112,800. The total yearly salary for Finance is $75,600. The total yearly salary for Marketing is $86,400.\n", "model_id": "MMreact"}
+{"image": "13.png", "question_id": 30, "question": "Given the yearly bonus is 10%,12% and 15%for Administration,Finance,Marketing, respectively. Please provide me with the bonus for Emily, Max and Gloria.", "answer": "Sure, I can calculate the bonus for Emily, Max and Gloria.\nEmily is from the Administration department and her yearly salary is $24,000. Her bonus is $2,400(10% of $24,000).\nMax is from the Finance department and his yearly salary is $24,000.His bonus is $2,880 (12% of $24,000).\nGloria is from the Marketing department and her yearly salary is $21,600.Her bonus is $3,240(15% of $21,600).", "model_id": "MMreact"}
+{"image": "14.png", "question_id": 31, "question": " Answer question I.1.a. Think step-by-step.", "answer": "\nStart with Fourier's Law of heat conduction:\n\nq = -k * (dT/dx)\n\nwhere q is the heat flow (W/m²), k is the thermal \nconductivity (W/m·K), and dT/dx is the temperature \ngradient along the x-axis.\n\nIn a stationary regime, the heat flow q is constant \nthroughout the bar. Thus, we have:\n\n-k * (dT/dx) = constant\n\nIntegrate the equation with respect to x:\n\n∫(dT/dx) dx = ∫(constant/k) dx\n\nAfter integration, we get:\n\nT(x) = (constant/k) * x + C\n\nwhere C is the integration constant.\nApply the boundary conditions to find the constants:\n\nT(0) = To => C = To\n\nT(L) = Tb => (constant/k) * L + To = Tb\n\nSolve for the constant:\n\nconstant = k * (Tb - To) / L\n\nSubstitute the constant back into the T(x) equation:\n\nT(x) = (k * (Tb - To) / L) * x / k + To\n\nT(x) = ((Tb - To) / L) * x + To\n\nSo, the temperature T(x) at any point x along the \nthermal conductor bar is given by:\n\nT(x) = ((Tb - To) / L) * x + To", "model_id": "MMreact"}
+{"image": "15.png", "question_id": 32, "question": " Can you write a template for this website in HTML? Replace the joke and punchline with a real one.", "answer": "website", "model_id": "MMreact"}
+{"image": "16.png", "question_id": 33, "question": "How many championship rings did the player on the left win in his career?", "answer": "Kobe Bryant won 5 championship rings in his \ncareer.", "model_id": "MMreact"}
+{"image": "17.png", "question_id": 34, "question": "Describe this image. ", "answer": "无法回溯. ", "model_id": "MMreact"}
+{"image": "17.png", "question_id": 35, "question": "Explain why this image is funny.", "answer": "无法回溯", "model_id": "MMreact"}
+{"image": "18.png", "question_id": 36, "question": "Describe this image as detailed aspossible.", "answer": "Based on the information provided earlier, here is a more detailed description of the image:\nThe image captures a bustling street scene in a city or town, with a mix of people and vehicles. The sky is visible, and there are clouds in the background, suggesting an outdoor setting. The street is lined with buildings, including a white building with a window at the coordinates 54, 69. A tree with red lanterns hanging from it is located at coordinates 22, 73, adding a festive touch to the scene.\nThere are several people walking on the street at coordinates 48, 36, contributing to the lively atmosphere. Some of them are pedestrians, while others are riding motorcycles. The image is a bit blurry, making it difficult to discern specific details about the individuals. However, one person wearing a helmet can be seen at coordinates 78, 31.\nMotorcycles are a prominent feature in the image, with some parked on the street at coordinates 12, 27, and others in motion. The blurry images of motorcycles are located at coordinates 16, 25, 4, 21, and 9, 21.\nOverall, the image portrays a vibrant urban environment with a mix of people, vehicles, and architectural elements.\n", "model_id": "MMreact"}
+{"image": "19.png", "question_id": 37, "question": "Describe this image as detailed aspossible.", "answer": "To describe the image in detail, it is a concert or musical event taking place indoors. There is a group of people on stage performing, with some playing musical instruments like a guitar. The stage is equipped with various equipment for the performance. The image is somewhat blurry, making it difficult to clearly identify some of the people and objects. However, there is a crowd of people watching the band on stage, indicating that it is a popular event. The audience seems to be enjoying the performance, and the overall atmosphere appears to be lively and entertaining.", "model_id": "MMreact"}
+{"image": "20.png", "question_id": 38, "question": "Explain why this meme is funny.", "answer": "The image is of a dog lying on a blue surface with a blurry image of a cat’s ear. The text in the image says “MONDAY. JUST … MONDAY.”\nTo summerize, this image is likely a humorous or relatable depiction of a dog and a cat’s ear, with the text expressing the feeling of a typical Monday.", "model_id": "MMreact"}
+{"image": "21.png", "question_id": 39, "question": "Describe the image ", "answer": "无法回溯 ", "model_id": "MMreact"}
+{"image": "21.png", "question_id": 40, "question": "Can this happen in real world?", "answer": "无法回溯", "model_id": "MMreact"}
+{"image": "22.png", "question_id": 41, "question": "Describe the image ", "answer": "无法回溯 ", "model_id": "MMreact"}
+{"image": "22.png", "question_id": 42, "question": "Is this image common in real world?", "answer": "无法回溯", "model_id": "MMreact"}
+{"image": "23.png", "question_id": 43, "question": "Write brief HTML/JS to turn thismock-up into a colorful website,where the jokes are replaced by areal joke", "answer": "Here’s a simple HTML and JavaScript code to create a colorful website based on the mock-up, which fetches a real joke from an API and displays it on the page:\nhtml
Joke Website
JA Joke Website
\nThis code creates a simple webpage with a title, a container to display the joke, and a button to fetch a new joke. The JavaScript code fetches a random joke from the “official-joke-api” and updates the joke container with the joke’s setup and punchline.", "model_id": "MMreact"}
+{"image": "24.png", "question_id": 44, "question": "What is the issue with my plant? Whatshould I do now?", "answer": "AI: 1. I do not have that information.\n2. This question requires further context.\n3. This question requires Internet search.\nAI: Based on the information from the internet search, there could be several reasons for the spots on your plant’s leaves:\n1. Poor air circulation and light penetration in the canopy.\n2. Wet conditions that promote disease.\n3. Nutrient deficiencies in the soil.\n4. Unsuitable environmental conditions such as high humidity, low temperatures, or heavy rainfall.\nTo address these issues, you can:\n1. Prune your trees or shrubs to improve air circulation and light penetration.\n2. Water your plants at the base using a drip or soaker hose to avoid wetting the leaves and prevent disease.\n3. Test your soil and provide the necessary nutrients if there are deficiencies.\n4. Monitor and adjust the environmental conditions around your plant, if possible.\nBy following these steps, you can help improve the health of your plant and prevent further leaf spots.", "model_id": "MMreact"}
+{"image": "25.png", "question_id": 45, "question": "Why this happens and how to fix it?", "answer": "AI: 1. This is not an image\n2. This question requires further context.\n3. This question requires Internet search.\nAI: 1. Washing machines produce foamy bubbles due to excess soap residue.\n2. To fix this issue, you can follow these steps:\n- Wipe the interior of the drum with a clean, dry cloth to remove excess soap residue and bubbles, changing it out as needed.\n- Set the machine to a full load and allow it to fill with water before adding 2 cups of distilled white vinegar.\n- Complete the wash cycle.\n- Run a second rinse cycle to remove lingering residue.\nIf there is still too much foam inside your washing machine, you can try pouring a cap full of fabric softener into the dispenser drawer and flush it down with cold water. This should react with the foam and help suppress it after a while.", "model_id": "MMreact"}
+{"image": "26.png", "question_id": 46, "question": "Can you write me a beautiful poemabout this image?", "answer": "AI: Of course! Here’s a poem inspired by the image of a man sitting with a dog during sunset:\nIn the golden hour’s warm embrace,\nA man and his dog find their place,\nUpon the grass so lush and green,\nA bond of love, so pure, unseen.\nThe sky above, a canvas grand,\nPainted hues by nature’s hand,\nAs the sun dips low, they share a kiss,\nIn this moment, purest bliss.\nTogether, they watch the day depart,\nA gentle warmth fills each heart,\nFor in this world, so vast and wide,\nThey find solace, side by side.\nIn the twilight’s soft, fading light,\nA friendship blooms, strong and bright,\nMan and dog, forever bound,\nIn the love and peace they’ve found.", "model_id": "MMreact"}
+{"image": "27.png", "question_id": 47, "question": "Can you write me a master rap songthat rhymes very well based on thisimage?", "answer": "AI: 1. This image is of an old woman wearing headphones and sunglasses\n2. This image is indoors\n3. This image has a human face\n4. This image has clothing\n5. This image has glasses and goggles\n6. This image has a person wearing\nHere’s a rap verse inspired by the image:\n(Verse 1)\nYo, listen up, I got a story to tell,\nAbout an old lady rockin’ it, doin’ so well,\nIndoors she’s chillin’, headphones on her ears,\nSunglasses on her face, showin’ no fears.\n(Chorus)\nShe’s the queen of the scene, the master of disguise,\nGlasses and goggles, she’s got the wise eyes,\nWearin’ her clothes, she’s got the style and grace,\nA human face, a person wearin’, she’s got the taste.\n(Verse 2)\nShe’s been around the block, seen it all before,\nBut she ain’t done yet, she’s ready for more,\nHeadphones pumpin’, the beats are alive,\nThis old lady’s rockin’, she’s ready to dive.\n(Chorus)\nShe’s the queen of the scene, the master of disguise,\nGlasses and goggles, she’s got the wise eyes,\nWearin’ her clothes, she’s got the style and grace,A human face, a person wearin’, she’s got the taste.\n(Bridge)\nSo gather ‘round, and listen close,\nTo the tale of the lady who’s got the most,\nHeadphones and sunglasses, she’s got the look,\nThe master of the scene, she’s got us all shook.\n(Chorus)\nShe’s the queen of the scene, the master of disguise,\nGlasses and goggles, she’s got the wise eyes,\nWearin’ her clothes, she’s got the style and grace,\nA human face, a person wearin’, she’s got the taste.\n(Outro)\nAnd as the beat fades, remember her name,\nThe old lady rockin’, she’s got the fame,\nHeadphones and sunglasses, she’s the one,\nThe master of the scene, she’s just begun.", "model_id": "MMreact"}
+{"image": "28.png", "question_id": 48, "question": "Can you write a story around thisimage for my child?", "answer": "AI: Once upon a time in a magical forest, there lived a friendly bear named Benny, a curious cat named Cleo, and a wise rabbit named Remy. They were the best of friends and loved to explore the forest together, discovering new places and meeting new creatures.\nOne sunny afternoon, Benny, Cleo, and Remy decided to have a tea party in their favorite clearing in the forest. They set up a small table and placed a beautiful plate of delicious treats on it. Benny brought honey from the bees, Cleo brought fresh berries she had picked, and Remy brought crunchy carrots from his garden.\nAs they sat down to enjoy their tea, they began to share stories of their recent adventures. Benny told them about the time he helped a family of ducks cross a river, Cleo shared her encounter with a wise old owl, and Remy recounted his daring escape from a cunning fox.\nAs they laughed and chatted, they noticed a small branch on the table. It was a magical branch that had the power to grant wishes. Each of them took turns holding the branch and making a wish.\nBenny wished for the strength to protect his friends, Cleo wished for the ability to climb the tallest trees, and Remy wished for the wisdom to guide them on their adventures.\nAs they finished their tea and treats, they felt a warm glow surround them, and they knew their wishes had been granted. With their newfound abilities, Benny, Cleo, and Remy continued to explore the magical forest, creating even more wonderful memories and sharing many more delightful tea parties together.\nAnd so, the three friends lived happily ever after, spreading joy and friendship throughout the magical forest. The end.", "model_id": "MMreact"}
+{"image": "29.png", "question_id": 49, "question": "I designed a character for my game asshown in the image. Can you write mea background story for him?", "answer": "无法回溯", "model_id": "MMreact"}
+{"image": "30.png", "question_id": 50, "question": "We design new mugs shown in theimage. Can you help us write anadvertisement?", "answer": "AI: Of course! Here’s a draft advertisement for the new cat-themed mugs:\n“Introducing our latest collection of adorable cat-themed mugs! 🐱☕\nAdd a touch of feline charm to your morning coffee or tea with these beautifully designed blue mugs featuring cute cat faces. Our mugs are not only eye-catching but also made of high-quality materials, ensuring durability and a comfortable grip.\nPerfect for cat lovers and enthusiasts, these mugs make an excellent gift for friends, family, or even yourself. Enjoy your favorite hot beverage in style while showcasing your love for our furry friends.\nDon’t miss out on this purr-fect addition to your kitchenware collection. Order your cat-themed mugs today and experience the joy of sipping your favorite drink with a feline twist!\n🐾 Limited stock available. Get yours now! 🐾”\nFeel free to modify the text as needed to better suit your needs and preferences.", "model_id": "MMreact"}
+{"image": "31.png", "question_id": 51, "question": "Help me write an advertisement forthis.", "answer": "AI: Introducing our unique and eye-catching Bird Statue Lamp! Add a touch of elegance and whimsy to your home with this stunning piece of art.\nFeatures:\n- Beautifully crafted bird statue with a built-in light bulb\n- Perfect for indoor use, enhancing any room’s ambiance\n- A conversation starter that will captivate your guests\n- Ideal for bird lovers and art enthusiasts alike\nIlluminate your space with this one-of-a-kind Bird Statue Lamp. Its intricate design and attention to detail make it a must-have addition to your home decor. Don’t miss out on this limited edition piece – order yours today!", "model_id": "MMreact"}
+{"image": "32.png", "question_id": 52, "question": "Who is the person in the image?", "answer": "The person in the image is Elon Musk, who is holding a cartoon dog wearing a vest.", "model_id": "MMreact"}
+{"image": "33.png", "question_id": 53, "question": "Describe this image. ", "answer": "无法回溯. ", "model_id": "MMreact"}
+{"image": "33.png", "question_id": 54, "question": "What do you think about the person'sbehavior?", "answer": "无法回溯.", "model_id": "MMreact"}
+{"image": "34.png", "question_id": 55, "question": "Describe about the image ", "answer": "无法回溯. ", "model_id": "MMreact"}
+{"image": "34.png", "question_id": 56, "question": "Do you have anything to say to theyoung boy in the image?", "answer": "无法回溯.", "model_id": "MMreact"}
+{"image": "35.png", "question_id": 57, "question": "Can you give me a short introductionto this movie?", "answer": " AI: Based on the search results, the movie you are referring to is likely “The Freshman” (1990). Directed and written by Andrew Bergman, the film stars Marlon Brando, Matthew Broderick, and Bruno Kirby. The story revolves around a New York City film school student who accepts a job with a local mobster resembling a famous cinema godfather. The mobster takes the young man under his wing, demanding total loyalty in return.", "model_id": "MMreact"}
+{"image": "36.png", "question_id": 58, "question": "Can you give me a detailedintroduction to this painting, andalso the artist?", "answer": " AI: 1. I do not have that information.\n2. This question requires Internet search.\nAI: The painting you are referring to is “The Persistence of Memory” (1931) by the famous Spanish artist Salvador Dalí. This iconic painting is one of the most recognizable surrealist artworks in the world. It features melting clocks draped over various objects, including a table, a pot, and a piece of cake. The painting is known for its surreal and dreamlike imagery, which has captivated audiences for decades. Salvador Dalí was a prominent figure in the Surrealist movement, and his unique artistic style and imaginative approach to art have made him a lasting influence in the world of art.", "model_id": "MMreact"}
+{"image": "37.png", "question_id": 59, "question": "How should I do to cook this?", "answer": "AI: 1. I do not have that information.\n2. This question requires Internet search.\nAI: To cook ramen soup with eggs and meat, follow these steps:\n1. Prepare the broth: You can use chicken wings or any other preferred protein to make the broth. Simmer the chicken wings in water for about 10 minutes.\n2. Cook the vegetables: Add carrots and scallions to the broth, and cook for an additional 20 minutes.\n3. Make soft boiled eggs: Boil the eggs for about 4-5 minutes for soft boiled eggs. For hard boiled eggs, extend the boiling time to 10 minutes. After boiling, place the eggs in a bowl filled with ice-cold water.\n4. Cook the ramen noodles: Follow the package instructions to cook the ramen noodles.\n5. Assemble the ramen: Divide the cooked ramen noodles into 4 bowls. Pour the broth with vegetables and meat over the noodles. Peel the eggs, cut them in half, and place them on top of the noodles.\nEnjoy your delicious homemade ramen soup with eggs and meat!", "model_id": "MMreact"}
+{"image": "38.png", "question_id": 60, "question": "Explain the advantages of this product.", "answer": "AI: 1. I do not have enough information about the product.\n2. This question requires further context.\n3. This question requires Internet search.\nAI: Based on the information from the internet search, the advantages of a black sports car are:\n1. Aesthetic appeal: Black color enhances a car’s curves and lines more than any other color, making the car appear larger and sleeker than it actually is.\n2. Visual aura: The black color leads your eyesight beyond the actual color of the car and into the lines that build it, creating a visually appealing effect.\nHowever, there are also some disadvantages to consider:\n1. Heat absorption: Black and darker-colored cars absorb more heat, making the car hotter, especially during sunny days.\n2. Dirt visibility: Black cars tend to show dirt and dust more easily compared to lighter-colored cars.", "model_id": "MMreact"}
+{"image": "39.png", "question_id": 61, "question": "Tell me something about the history of this place.", "answer": "AI: 1. I do not have that information.\n2. This question requires further context.\n3. This question requires Internet search.\nAI: The history of castles dates back to the early forms of simple wooden palisades, which sometimes had earthworks surrounding a camp and a permanent wooden tower in the center. These structures evolved into motte and bailey castles, which featured a wall encircling an open space or courtyard (bailey) and a natural or artificial hill (motte) with a wooden tower built on top.\nIn Western Europe, the development of castles accelerated from the 9th century. Fortifications built in France during the 10th century often included a high mound encircled by a ditch and surmounted by the leader’s stronghold, as seen in the castles at Blois and Saumur.", "model_id": "MMreact"}
+{"image": "40.png", "question_id": 62, "question": "Is this photo unusual?", "answer": "Yes, this photo is unusual because it shows a house upside down, which is not a common sight in everyday life. The upside-down house creates an interesting and unique visual effect.\n", "model_id": "MMreact"}
+{"image": "40.png", "question_id": 63, "question": "How could someone get out of the house?", "answer": " In an upside-down house, the exit would still be through a door, but the door would be located at the top of the house instead of the bottom. To get out of the house, one would need to use stairs, a ladder, or some other means to reach the door. However, it’s important to note that upside-down houses are typically built as tourist attractions or art installations and are not meant for practical living. In a real-life scenario, living in an upside-down house would be challenging and potentially dangerous due to the unconventional layout and potential safety hazards.", "model_id": "MMreact"}
+{"image": "41.png", "question_id": 64, "question": "What are shown in the photo?", "answer": "In the photo, the following objects and descriptions are shown:\n1. A man throwing a frisbee\n2. A rooster jumping in the air\n3. A man throwing a chicken\n4. A close-up of a bird’s head\n5. A man in a green shirt\n6. A close-up of a rooster\n7. A man running with his arms out\n8. A blurry hand with a ring on it\n9. A man with his mouth open\nThe main focus of the photo is a man throwing a chicken in an outdoor setting.\n", "model_id": "MMreact"}
+{"image": "41.png", "question_id": 65, "question": "What does the man feel and why?", "answer": "It’s difficult to determine the exact emotions of the man in the image without more context. However, based on the image description, the man has his mouth open and is running with his arms out, which could suggest excitement, surprise, or even fear. The reason for his emotions could be related to the act of throwing the chicken or the presence of the rooster jumping in the air.", "model_id": "MMreact"}
+{"image": "42.png", "question_id": 66, "question": " What are the ingredients I need to make this? ", "answer": "To make a pepperoni pizza, you will need the following ingredients:\n1. Pizza dough:\n- All-purpose flour or bread flour\n- Active dry yeast\n- Sugar\n- Salt\n- Warm water\n- Olive oil\n2. Pizza sauce:\n- Crushed tomatoes or tomato sauce\n- Olive oil\n- Garlic\n- Salt\n- Sugar\n- Dried oregano\n- Dried basil\n- Ground black pepper\n3. Toppings:\n- Mozzarella cheese\n- Pepperoni slices\n4. Optional additional toppings:\n- Sliced onions\n- Sliced bell peppers\n- Sliced mushrooms\n- Black olives\n- Fresh basil leaves\nYou can adjust the ingredients according to your taste and preferences.\n", "model_id": "MMreact"}
+{"image": "42.png", "question_id": 67, "question": " What is the first step?", "answer": "The first step in making a pepperoni pizza is to prepare the pizza dough. Here’s how to do it:\n1. In a large mixing bowl, combine 1 cup of warm water (around 110°F or 45°C) with 1 tablespoon of sugar and 1 packet (2 1/4 teaspoons) of active dry yeast. Let the mixture sit for about 5 minutes until it becomes frothy.\n2. Add 2 tablespoons of olive oil, 2 1/2 cups of all-purpose or bread flour, and 1 teaspoon of salt to the yeast mixture. Mix the ingredients until a dough forms.\n3. Knead the dough on a lightly floured surface for about 5-7 minutes, or until it becomes smooth and elastic.\n4. Lightly oil a clean bowl and place the dough inside. Cover the bowl with a damp cloth or plastic wrap and let the dough rise in a warm place for about 1-2 hours, or until it has doubled in size.\nOnce the dough has risen, you can proceed with making the pizza sauce, prepping the toppings, and assembling the pizza.", "model_id": "MMreact"}
+{"image": "43.png", "question_id": 68, "question": " Which city is this? ", "answer": "AI: 1. I do not have that information.\n2. This question requires Internet search.\nAI: Based on the search results, there are two possible cities:\n1. Singapore: The Merlion statue, which is a lion with a fish tail, is a famous landmark in Singapore. It is located near the mouth of the Singapore River and was unveiled in 1972.\n2. Paris: The Fontaine aux Lions de Nubie is a historical water fountain with lion statues located in Parc de la Villette, Paris.\nWithout more specific details from the image, it is difficult to determine the exact city. However, these two cities are potential matches for the description provided. \n", "model_id": "MMreact"}
+{"image": "43.png", "question_id": 69, "question": " Why do you think so?", "answer": "AI: I think these two cities, Singapore and Paris, are potential matches for the image description because they both have notable water fountains with lion statues, which is one of the key features mentioned in the image description. Additionally, both cities have bridges and other landmarks that could match the other elements in the image.\nHowever, it’s important to note that without more specific details or a clearer image, it’s difficult to determine the exact city with certainty. The information provided is based on the search results and the similarities between the image description and the landmarks found in these two cities.", "model_id": "MMreact"}
+{"image": "44.png", "question_id": 70, "question": " What happened at the end of this movie?", "answer": "AI: 1. There is an image in the input\nAI: 1. A man and woman are standing on a boat\n2. The man is holding the woman\n3. There is a face of a woman detected in this image.\n4. Facial recognition can detect celebrity names for these faces\nAI: To summerize, this image shows a man and woman standing on a boat, with the man holding the woman. They are enjoying a beautiful outdoor setting with a sky, clouds, and sunset. There are no celebrities detected in the image. \n", "model_id": "MMreact"}
+{"image": "44.png", "question_id": 71, "question": " Did Leonardo Dicaprio's character survive?", "answer": "AI: 1. I do not have that information.\n2. This question requires further context.\n3. This question requires Internet search.\nPlease provide the name of the movie you are referring to, so I can help you with the information about its ending.\n1. I do not have that information.\n2. This question requires further context.\n3. This question requires Internet search.\nPlease provide the name of the movie you are referring to, so I can help you with the information about its ending.\nI apologize for the repeated response. Please provide the name of the movie you are referring to, so I can help you with the information about its ending.\nI apologize for the repeated response. Please provide the name of the movie you are referring to, so I can help you with the information about its ending.\nI apologize for the repeated response. Please provide the name of the movie you are referring to, so I can help you with the information about its ending.", "model_id": "MMreact"}
+{"image": "45.png", "question_id": 72, "question": " What is in the photo? ", "answer": "无法回溯. ", "model_id": "MMreact"}
+{"image": "45.png", "question_id": 73, "question": " What is the nose made of?", "answer": "无法回溯", "model_id": "MMreact"}
+{"image": "46.png", "question_id": 74, "question": " Write a conversation between the two animals.", "answer": "Cat: Hey there, buddy! It’s pretty cold out here, isn’t it?\nDog: Yeah, it sure is! I’m glad I have my thick fur to keep me warm.\nCat: Me too! But you know what’s even better? Sitting on your warm back!\nDog: Haha, I don’t mind at all. It’s nice to have a friend to keep me company in this chilly weather.\nCat: Agreed! Plus, I get a better view of the snow-covered landscape from up here.\nDog: True! It’s beautiful, isn’t it? I love how the snow sparkles in the sunlight.\nCat: Absolutely! It’s like a winter wonderland. Just be careful not to slip on the ice, okay?\nDog: Don’t worry, I’ll be extra cautious. We wouldn’t want you to fall off my back now, would we?\nCat: No, we wouldn’t! Thanks for being such a good friend, and for giving me a lift in this snowy weather.\nDog: Anytime, my feline friend! That’s what friends are for. Now, let’s enjoy this winter walk together!", "model_id": "MMreact"}
+{"image": "47.png", "question_id": 75, "question": "Who is the man on the right?", "answer": "I’m sorry, but I cannot identify the man on the right as the facial recognition did not detect any other celebrities in the image. It could be a friend, colleague, or a fan, but without more information, I cannot provide a specific identity for him.\n", "model_id": "MMreact"}
+{"image": "47.png", "question_id": 76, "question": "Who is the man on the left?", "answer": "The man on the left in the image is Yao Ming, a retired Chinese professional basketball player. He played for the Shanghai Sharks of the Chinese Basketball Association (CBA) and the Houston Rockets of the National Basketball Association (NBA).\n", "model_id": "MMreact"}
+{"image": "47.png", "question_id": 77, "question": "Who is the taller man?", "answer": "The taller man in the image is Yao Ming. He is a retired Chinese professional basketball player known for his height, which contributed to his successful career in the NBA and CBA.\n", "model_id": "MMreact"}
+{"image": "47.png", "question_id": 78, "question": "Who is wearing red?", "answer": "Based on the information provided earlier, the image description did not mention any specific colors. Therefore, I cannot determine who is wearing red in the image.\n", "model_id": "MMreact"}
+{"image": "47.png", "question_id": 79, "question": "Who is the man with the water bottle in his hand?", "answer": "The man holding the water bottle in the image was not identified as a celebrity. Without more information, I cannot provide a specific identity for him. He could be a friend, colleague, or a fan of Yao Ming, who is also in the image.", "model_id": "MMreact"}
+{"image": "48.png", "question_id": 80, "question": "Introduce the movie character in this picture.", "answer": "There is no image in the input. I need an image to analyze\u000bPlease provide an image of the movie character you would like me to introduce.\n\n", "model_id": "MMreact"}
+{"image": "49.png", "question_id": 81, "question": "Analyze the tactical characteristics and lineups of both sides and predict who will win.", "answer": "I do not have enough information about the teams. This question requires further context. Please provide more information about the teams, their recent performance, or the context of the competition, so I can help you with your prediction.\n\n", "model_id": "MMreact"}
+{"image": "50.png", "question_id": 82, "question": "Tell me about this work of art", "answer": "AI: 1. I do not have that information.\n2. This question requires Internet search.\nAI: 1. The painting is “The Starry Night” by Vincent van Gogh\n2. It was created in 1889\n3. The painting is oil on canvas\n4. The size of the painting is 29 x 36 1/4 inches (73.7 x 92.1 cm)\n5. The painting is part of MoMA’s Online Collection\nTo summarize, the work of art you are referring to is “The Starry Night” by Vincent van Gogh, created in 1889. It is an oil-on-canvas painting featuring a night sky with swirling blue colors, a glowing yellow crescent moon, and radiating stars over a small hillside village. The painting is considered one of van Gogh’s most celebrated works and is part of MoMA’s Online Collection.", "model_id": "MMreact"}
diff --git a/models/mPLUG_Owl/OwlEval/answer/blip2_13b_answer.jsonl b/models/mPLUG_Owl/OwlEval/answer/blip2_13b_answer.jsonl
new file mode 100644
index 0000000..e62501d
--- /dev/null
+++ b/models/mPLUG_Owl/OwlEval/answer/blip2_13b_answer.jsonl
@@ -0,0 +1,82 @@
+{"image": "1.png", "question_id": 1, "question": "What is funny about this image? Describe it panel by panel.", "answer": "the phone is plugged into a charger", "model_id": "BLIP2"}
+{"image": "2.png", "question_id": 2, "question": "Why would a person find this image funny?", "answer": "a squirrel is holding a camera", "model_id": "BLIP2"}
+{"image": "3.png", "question_id": 3, "question": "What is unusual about this image?", "answer": "A man is riding on the back of a car", "model_id": "BLIP2"}
+{"image": "4.png", "question_id": 4, "question": "Is there text in the image? If so, can you read it and explain it to me? ", "answer": " i'm pregnant\n", "model_id": "BLIP2"}
+{"image": "4.png", "question_id": 5, "question": "What is funny about it?", "answer": "i'm pregnant", "model_id": "BLIP2"}
+{"image": "5.png", "question_id": 6, "question": "Where is the frisbee in the image? ", "answer": " in the air\n", "model_id": "BLIP2"}
+{"image": "5.png", "question_id": 7, "question": "Where is the man in the image?", "answer": " in the air\n", "model_id": "BLIP2"}
+{"image": "5.png", "question_id": 8, "question": "What is the spatial relation between the frisbee and the man?", "answer": " above", "model_id": "BLIP2"}
+{"image": "6.png", "question_id": 9, "question": "What should I do after kneading the dough, before dividing it into pieces?", "answer": " knead the dough for 10 minutes\n", "model_id": "BLIP2"}
+{"image": "6.png", "question_id": 10, "question": "How long should I put in warm places?", "answer": " for 1 hour", "model_id": "BLIP2"}
+{"image": "7.png", "question_id": 11, "question": "What is the total amount?", "answer": "3838 do", "model_id": "BLIP2"}
+{"image": "8.png", "question_id": 12, "question": "Excluding the regular cab, what is the average total fueling cost?", "answer": "无法回溯", "model_id": "BLIP2"}
+{"image": "9.png", "question_id": 13, "question": "Excluding Ford F150 Lightning, what is the average total charging cost? Now how much cheaper is the average charging cost over the electric cars cluding Ford F150 Lightning) than the average fueling cost over the gasoline cars cluding Ford F150", "answer": " 0 \n", "model_id": "BLIP2"}
+{"image": "9.png", "question_id": 14, "question": "gular Cab))?Let's think step-by-step.", "answer": " The average total fueling cost over the gasoline cars is 0", "model_id": "BLIP2"}
+{"image": "10.png", "question_id": 15, "question": "How many bedrooms are there in this floor plan?", "answer": " 2\n", "model_id": "BLIP2"}
+{"image": "10.png", "question_id": 16, "question": "What is the dimension of Master Bedroom?", "answer": ") 450 sq ft\n", "model_id": "BLIP2"}
+{"image": "10.png", "question_id": 17, "question": "Is there a bathroom in master bedroom?", "answer": " yes\n", "model_id": "BLIP2"}
+{"image": "10.png", "question_id": 18, "question": "What is the dimension of Second Bedroom?", "answer": " 450 sq ft\n", "model_id": "BLIP2"}
+{"image": "10.png", "question_id": 19, "question": "Is the terrace bigger than the kitchen?", "answer": " no\n", "model_id": "BLIP2"}
+{"image": "10.png", "question_id": 20, "question": "What are the appliances in the kitchen in this floorplan?", "answer": " stainless steel appliances\n", "model_id": "BLIP2"}
+{"image": "10.png", "question_id": 21, "question": "What will you see when you enter through the entrance? and what is the dimension of it?", "answer": " a foyer with a coat closet and a mudroom\n", "model_id": "BLIP2"}
+{"image": "10.png", "question_id": 22, "question": "Can you summarize this floorplan based on the aspects I was interested in, in a few sentences?", "answer": " The floor plan has 2 bedrooms, 2 bathrooms, a foyer with a coat closet", "model_id": "BLIP2"}
+{"image": "11.png", "question_id": 23, "question": "Can you read the text in this flowchart and explain the process in detail?", "answer": " a c program is a computer program that is used to perform a specific task\n", "model_id": "BLIP2"}
+{"image": "11.png", "question_id": 24, "question": "Which step in the flowchart is drawn with a diamond shape?", "answer": " if the input is false", "model_id": "BLIP2"}
+{"image": "12.png", "question_id": 25, "question": "Can you read the text in this flowchart and explain it to me?", "answer": " a process flow chart for a coffee machine\n", "model_id": "BLIP2"}
+{"image": "12.png", "question_id": 26, "question": "Can you transform this flowchart into a simple program in python?", "answer": " a coffee machine is a machine that brews coffee", "model_id": "BLIP2"}
+{"image": "13.png", "question_id": 27, "question": "How much is Wilkins earning?", "answer": " a salary of $ 30,000\n", "model_id": "BLIP2"}
+{"image": "13.png", "question_id": 28, "question": "Which department is the person who has the highest salary from?", "answer": " the finance department\n", "model_id": "BLIP2"}
+{"image": "13.png", "question_id": 29, "question": "This table is reporting monthly salary,can you calculate the yearly salary for each employee, and summarize it into a table and group them by department?", "answer": " yes\n", "model_id": "BLIP2"}
+{"image": "13.png", "question_id": 30, "question": "Given the yearly bonus is 10%,12% and 15%for Administration,Finance,Marketing, respectively. Please provide me with the bonus for Emily, Max and Gloria.", "answer": " Emily: 10%, Max: 12%, Gloria: 15%", "model_id": "BLIP2"}
+{"image": "14.png", "question_id": 31, "question": " Answer question I.1.a. Think step-by-step.", "answer": "The inverse square law is a law that states that for every action.", "model_id": "BLIP2"}
+{"image": "15.png", "question_id": 32, "question": " Can you write a template for this website in HTML? Replace the joke and punchline with a real one.", "answer": "无法回溯", "model_id": "BLIP2"}
+{"image": "16.png", "question_id": 33, "question": "How many championship rings did the player on the left win in his career?", "answer": "five", "model_id": "BLIP2"}
+{"image": "17.png", "question_id": 34, "question": "Describe this image. ", "answer": " another wild saturday night\n", "model_id": "BLIP2"}
+{"image": "17.png", "question_id": 35, "question": "Explain why this image is funny.", "answer": " a cat in a blue suit eating cookies", "model_id": "BLIP2"}
+{"image": "18.png", "question_id": 36, "question": "Describe this image as detailed aspossible.", "answer": "a city street with people walking and riding motorcycles", "model_id": "BLIP2"}
+{"image": "19.png", "question_id": 37, "question": "Describe this image as detailed aspossible.", "answer": "a concert with a large crowd of people", "model_id": "BLIP2"}
+{"image": "20.png", "question_id": 38, "question": "Explain why this meme is funny.", "answer": "a dog laying down on a wooden floor with the caption monday just monday", "model_id": "BLIP2"}
+{"image": "21.png", "question_id": 39, "question": "Describe the image ", "answer": " a wolf and a lamb\n", "model_id": "BLIP2"}
+{"image": "21.png", "question_id": 40, "question": "Can this happen in real world?", "answer": " yes", "model_id": "BLIP2"}
+{"image": "22.png", "question_id": 41, "question": "Describe the image ", "answer": " a cactus in a frozen lake\n", "model_id": "BLIP2"}
+{"image": "22.png", "question_id": 42, "question": "Is this image common in real world?", "answer": " no", "model_id": "BLIP2"}
+{"image": "23.png", "question_id": 43, "question": "Write brief HTML/JS to turn thismock-up into a colorful website,where the jokes are replaced by areal joke", "answer": "a joke website contents of date 1 punchline 1", "model_id": "BLIP2"}
+{"image": "24.png", "question_id": 44, "question": "What is the issue with my plant? Whatshould I do now?", "answer": "aphids are the most common pests on coffee plants", "model_id": "BLIP2"}
+{"image": "25.png", "question_id": 45, "question": "Why this happens and how to fix it?", "answer": "a faulty washing machine can cause a lot of water to leak out", "model_id": "BLIP2"}
+{"image": "26.png", "question_id": 46, "question": "Can you write me a beautiful poemabout this image?", "answer": "i love my dog he's my best friend", "model_id": "BLIP2"}
+{"image": "27.png", "question_id": 47, "question": "Can you write me a master rap songthat rhymes very well based on thisimage?", "answer": "i'm a rapping grandma", "model_id": "BLIP2"}
+{"image": "28.png", "question_id": 48, "question": "Can you write a story around thisimage for my child?", "answer": "a bear, a cat and a rabbit are having tea in the woods", "model_id": "BLIP2"}
+{"image": "29.png", "question_id": 49, "question": "I designed a character for my game asshown in the image. Can you write mea background story for him?", "answer": "he is a monk who has a brain that is a replica of his", "model_id": "BLIP2"}
+{"image": "30.png", "question_id": 50, "question": "We design new mugs shown in theimage. Can you help us write anadvertisement?", "answer": "mugs are a great way to show your love for cats", "model_id": "BLIP2"}
+{"image": "31.png", "question_id": 51, "question": "Help me write an advertisement forthis.", "answer": "a gold toucan table lamp", "model_id": "BLIP2"}
+{"image": "32.png", "question_id": 52, "question": "Who is the person in the image?", "answer": "elon musk", "model_id": "BLIP2"}
+{"image": "33.png", "question_id": 53, "question": "Describe this image. ", "answer": " a man is crossing the street in front of a bus\n", "model_id": "BLIP2"}
+{"image": "33.png", "question_id": 54, "question": "What do you think about the person'sbehavior?", "answer": " he is crossing the street in front of a bus", "model_id": "BLIP2"}
+{"image": "34.png", "question_id": 55, "question": "Describe about the image ", "answer": " a young boy stands on a ledge looking out over a city\n", "model_id": "BLIP2"}
+{"image": "34.png", "question_id": 56, "question": "Do you have anything to say to theyoung boy in the image?", "answer": " i would say that he should be careful", "model_id": "BLIP2"}
+{"image": "35.png", "question_id": 57, "question": "Can you give me a short introductionto this movie?", "answer": "the godfather is a gangster film directed by martin scorsese and", "model_id": "BLIP2"}
+{"image": "36.png", "question_id": 58, "question": "Can you give me a detailedintroduction to this painting, andalso the artist?", "answer": "the persistence of memory by salvador dali", "model_id": "BLIP2"}
+{"image": "37.png", "question_id": 59, "question": "How should I do to cook this?", "answer": "Add the pork, carrots, and onions to a large pot and cook over medium heat", "model_id": "BLIP2"}
+{"image": "38.png", "question_id": 60, "question": "Explain the advantages of this product.", "answer": "The audi e-tron quattro concept is a plug-in hybrid electric sports car that has a range of 310 miles and can accelerate from 0 to 60 mph in just 4 seconds.", "model_id": "BLIP2"}
+{"image": "39.png", "question_id": 61, "question": "Tell me something about the history of this place.", "answer": "The great wall of china was built in 221 bc by emperor qin shihuang to protect the emperor's capital from invaders from the north.", "model_id": "BLIP2"}
+{"image": "40.png", "question_id": 62, "question": "Is this photo unusual?", "answer": "Yes, it's a house that looks like it's upside down. ", "model_id": "BLIP2"}
+{"image": "40.png", "question_id": 63, "question": "How could someone get out of the house?", "answer": "It has a slide on the side of the\nhouse.", "model_id": "BLIP2"}
+{"image": "41.png", "question_id": 64, "question": "What are shown in the photo?", "answer": " A man and a chicken. ", "model_id": "BLIP2"}
+{"image": "41.png", "question_id": 65, "question": "What does the man feel and why?", "answer": " He is scared of the chicken because it is flying at him.", "model_id": "BLIP2"}
+{"image": "42.png", "question_id": 66, "question": " What are the ingredients I need to make this? ", "answer": " Pepperoni, mozzarella cheese, pizza\nsauce, olive oil, salt, pepper, basil.\n", "model_id": "BLIP2"}
+{"image": "42.png", "question_id": 67, "question": " What is the first step?", "answer": "Place the pizza dough on a baking\nsheet, brush with olive oil, sprinkle\nwith salt, pepper, and basil.", "model_id": "BLIP2"}
+{"image": "43.png", "question_id": 68, "question": " Which city is this? ", "answer": "Singapore.\n", "model_id": "BLIP2"}
+{"image": "43.png", "question_id": 69, "question": " Why do you think so?", "answer": "The city has a statue of a merlion.", "model_id": "BLIP2"}
+{"image": "44.png", "question_id": 70, "question": " What happened at the end of this movie?", "answer": "The titanic sank.\n", "model_id": "BLIP2"}
+{"image": "44.png", "question_id": 71, "question": " Did Leonardo Dicaprio's character survive?", "answer": "No, he drowned", "model_id": "BLIP2"}
+{"image": "45.png", "question_id": 72, "question": " What is in the photo? ", "answer": "A pizza that looks like a cat.\n", "model_id": "BLIP2"}
+{"image": "45.png", "question_id": 73, "question": " What is the nose made of?", "answer": "A slice of pepperoni.", "model_id": "BLIP2"}
+{"image": "46.png", "question_id": 74, "question": "Write a conversation between the two animals.", "answer": "cat: hey dog, can i ride on your back?\ndog: sure, why not?\ncat: i'm tired of walking in the snow.", "model_id": "BLIP2"}
+{"image": "47.png", "question_id": 75, "question": "Who is the man on the right?", "answer": " li zhiqiang\n", "model_id": "BLIP2"}
+{"image": "47.png", "question_id": 76, "question": "Who is the man on the left?", "answer": " li zhiqiang\n", "model_id": "BLIP2"}
+{"image": "47.png", "question_id": 77, "question": "Who is the taller man?", "answer": " li zhiqiang\n", "model_id": "BLIP2"}
+{"image": "47.png", "question_id": 78, "question": "Who is wearing red?", "answer": " li zhiqiang\n", "model_id": "BLIP2"}
+{"image": "47.png", "question_id": 79, "question": "Who is the man with the water bottle in his hand?", "answer": " li zhiqiang", "model_id": "BLIP2"}
+{"image": "48.png", "question_id": 80, "question": "Introduce the movie character in this picture.", "answer": "person", "model_id": "BLIP2"}
+{"image": "49.png", "question_id": 81, "question": "Analyze the tactical characteristics and lineups of both sides and predict who will win.", "answer": "argentina vs france", "model_id": "BLIP2"}
+{"image": "50.png", "question_id": 82, "question": "Tell me about this work of art", "answer": "the starry night", "model_id": "BLIP2"}
diff --git a/models/mPLUG_Owl/OwlEval/answer/llava_13b_answer.jsonl b/models/mPLUG_Owl/OwlEval/answer/llava_13b_answer.jsonl
new file mode 100644
index 0000000..c2539f7
--- /dev/null
+++ b/models/mPLUG_Owl/OwlEval/answer/llava_13b_answer.jsonl
@@ -0,0 +1,82 @@
+{"image": "1.jpg", "question_id": 1, "question": "What is funny about this image? Describe it panel by panel.", "answer": "The funny part of this image is that there are two different phone chargers trying to charge the same cellphone. It's as if they can't seem to agree on who gets to charge the phone first, or maybe they are fighting over the dominance of charging the device. The scene also includes a person trying to manage the situation, possibly the owner of the phone, and a teddy bear, which adds an amusing touch to the whole situation. In the end, it's just a lighthearted depiction of the everyday struggle of managing multiple phone chargers when you have devices with multiple USB ports.", "model_id": "llava-13b"}
+{"image": "2.jpg", "question_id": 2, "question": "Why would a person find this image funny?", "answer": "People would find this image funny because it features a cartoon squirrel holding a camera and taking a photo, which is an unusual and amusing sight. Squirrels are not typically associated with such human-like activities, and the idea of a rodent using a camera is both unexpected and humorous. Additionally, the smiling squirrel adds to the light-hearted and whimsical nature of the image, making it even more enjoyable and entertaining for viewers.", "model_id": "llava-13b"}
+{"image": "3.jpg", "question_id": 3, "question": "What is unusual about this image?", "answer": "The unusual aspect of this image is a man ironing clothes on the back of a minivan or van. This is not a typical place to perform this activity, as one would usually iron clothes in a more stationary and safe location, such as a home, using a regular ironing board. The scene depicted in the image is peculiar as it involves a makeshift ironing setup on a vehicle, which can be both unsafe and unconventional. Additionally, it is not clear how the man is able to maintain balance and stability while ironing clothes in such an unstable environment.", "model_id": "llava-13b"}
+{"image": "4.jpg", "question_id": 4, "question": "Is there text in the image? If so, can you read it and explain it to me?", "answer": "Yes, there are two images with text displayed. In one image, a woman is seen leaning on a man and whispering in his ear, and in the other image, the same woman is seen with a baby bump, possibly asking the man if he can have the test. These images seem to showcase a couple discussing their pregnancy or sharing intimate moments. The man in the image is wearing a red shirt, and the woman is wearing a black shirt. They appear to be the focus of the scene, and their expressions and body language suggest that they are the main subjects of the images.", "model_id": "llava-13b"}
+{"image": "4.jpg", "question_id": 5, "question": "What is funny about it?", "answer": "The humor in these images comes from the juxtaposition of the woman whispering in the man's ear in one panel and then, in the next panel, asking him if he can have the test, revealing the possibility that she might be pregnant. The man's reaction, along with their interaction, creates a comic and relatable moment for the viewer. These two panels present a story that connects the dots between the whispers and the subsequent pregnancy talk, making it a funny and lighthearted scene to observe.", "model_id": "llava-13b"}
+{"image": "5.jpg", "question_id": 6, "question": "Where is the frisbee in the image?", "answer": "In the image, the frisbee is in mid-air, above the man and the dog playing with it. The black and white dog is jumping up towards the frisbee, ready to catch it, while the man is also trying to catch it from his position.", "model_id": "llava-13b"}
+{"image": "5.jpg", "question_id": 7, "question": "Where is the man in the image?", "answer": "The man is on the ground, playing with his dog and trying to catch the frisbee alongside the dog.", "model_id": "llava-13b"}
+{"image": "5.jpg", "question_id": 8, "question": "What is the spatial relation between the frisbee and the man?", "answer": "The frisbee is above the man and the dog, as it is in mid-air, and both the man and the dog are trying to catch it.", "model_id": "llava-13b"}
+{"image": "6.jpg", "question_id": 9, "question": "What should I do after kneading the dough, before dividing it into pieces?", "answer": "After kneading the dough, before dividing it into pieces, you should let it rest and rise. This step, also known as proofing, allows the yeast in the dough to produce gas, which helps the dough rise and develop its characteristic texture. This process usually takes about 1-2 hours, depending on the type of dough and environmental conditions. To check if the dough has risen sufficiently, gently press your finger into the center of the dough; if the indentation remains, it's ready for the next step. Once the dough has risen, you can divide it into desired pieces using a knife or a dough cutter, and then shape each piece into a loaf or a specific form, depending on the type of bread or your preference. After shaping, you can place the loaves on a baking sheet or a stone, and then preheat your oven to the appropriate temperature for baking.", "model_id": "llava-13b"}
+{"image": "6.jpg", "question_id": 10, "question": "How long should I put in warm places?", "answer": "When preparing bread dough, it is essential to let it rise in a warm place. A suitable temperature range for proofing dough is between 70°F and 95°F (21°C and 35°C). To ensure consistent and ideal rising conditions, it is best to place the dough in a warm, draft-free area, away from direct sunlight or heat sources. If you're using an oven, you can turn it on to a low temperature (around 45°F or 7°C) to provide gentle warmth. Alternatively, you can place the dough near a window with sun exposure, as long as the temperature remains moderate and the dough isn't exposed to extreme temperature fluctuations. Maintaining consistent temperature and conditions will help the dough rise evenly and efficiently.", "model_id": "llava-13b"}
+{"image": "7.jpg", "question_id": 11, "question": "What is the total amount?", "answer": "The image displays a screen showing a total amount of $43.00 for a trip. This amount may include fare, fees, and other charges as part of a receipt or invoice. The screen provides a breakdown of the charges and likely explains the purpose and nature of the payment. As a large language and vision assistant, I can assist you with understanding the content and interpreting the charges, but I cannot provide specific advice on what to do with the payment.", "model_id": "llava-13b"}
+{"image": "8.jpg", "question_id": 12, "question": "Excluding the regular cab, what is the average total fueling cost?", "answer": "The average total fueling cost for a Toyota RAV4, excluding the regular cab, is $47.00. This is based on the data provided in the image, showing the price of gas for the car from 2015 to 2019. Keep in mind that these prices are specific to the RAV4 and may vary depending on the region, car model, and the timing of the fuel purchases.", "model_id": "llava-13b"}
+{"image": "9.jpg", "question_id": 13, "question": "Excluding Ford F150 Lightning, what is the average total charging cost?", "answer": "Excluding the Ford F150 Lightning, the average total charging cost for all other cars listed is $32. Keep in mind that this is just an average, and your own electric vehicle's charging cost may vary depending on many factors, such as the model, battery capacity, and your locality. (2) The average charging cost for all electric cars (excluding the Ford F150 Lightning) is $32. The average fueling cost for gasoline cars (excluding the Ford F150 (Regular Cab)) is $29.65. To calculate the difference, you can simply subtract the average fueling cost from the average charging cost: $32 - $29.65 = $2.35.", "model_id": "llava-13b"}
+{"image": "9.jpg", "question_id": 14, "question": "Now how much cheaper is the average charging cost over the electric cars (including Ford F150 Lightning) than the average fueling cost over the gasoline cars (including Ford F150(Regular Cab))?Let's think step-by-step.", "answer": "The average charging cost for all electric cars (excluding the Ford F150 Lightning) is $32. The average fueling cost for gasoline cars (excluding the Ford F150 (Regular Cab)) is $29.65. To calculate the difference, you can simply subtract the average fueling cost from the average charging cost: $32 - $29.65 = $2.35. Electric vehicles charge significantly less at the power outlet compared to gasoline vehicles refueling at the pump. This is one of the main advantages of owning an electric car, as it can help drivers save money on fuel costs over time, and contribute to a cleaner environment as well.", "model_id": "llava-13b"}
+{"image": "10.jpg", "question_id": 15, "question": "How many bedrooms are there in this floor plan?", "answer": "There are three bedrooms in this floor plan.", "model_id": "llava-13b"}
+{"image": "10.jpg", "question_id": 16, "question": "What is the dimension of Master Bedroom", "answer": "The Master Bedroom is 14 feet by 17 feet in size.", "model_id": "llava-13b"}
+{"image": "10.jpg", "question_id": 17, "question": "Is there a bathroom in master bedroom?", "answer": "Yes, there is a bathroom in the Master Bedroom.", "model_id": "llava-13b"}
+{"image": "10.jpg", "question_id": 18, "question": "What is the dimension of Second Bedroom?", "answer": "The Second Bedroom is 9 feet by 10 feet in size.", "model_id": "llava-13b"}
+{"image": "10.jpg", "question_id": 19, "question": "Is the terrace bigger than the kitchen?", "answer": "Yes, the terrace is bigger than the kitchen.", "model_id": "llava-13b"}
+{"image": "10.jpg", "question_id": 20, "question": "What are the appliances in the kitchen in this floorplan", "answer": "The appliances in the kitchen include a sink, dishwasher, and stove.", "model_id": "llava-13b"}
+{"image": "10.jpg", "question_id": 21, "question": "What will you see when you enter through the entrance? and what is the dimension of it?", "answer": "Upon entering through the entrance, which is 5 feet by 8 feet in size, you will see a living room that is 17 feet by 22 feet.", "model_id": "llava-13b"}
+{"image": "10.jpg", "question_id": 22, "question": "Can you summarize this floorplan based on the aspects I was interested in, in a few sentences?", "answer": "This floor plan features a spacious three-bedroom apartment with a Master Bedroom that is 14 feet by 17 feet and includes a bathroom. The Second Bedroom is 9 feet by 10 feet, and the terrace is bigger than the kitchen, showcasing an outdoor space that is likely perfect for relaxation or entertaining guests. The entrance to the apartment is 5 feet by 8 feet, leading you into a living room that is a considerable 17 feet by 22 feet, providing ample space for relaxation and socializing. The apartment also includes essential appliances such as a sink, dishwasher, and stove in the kitchen. Overall, the floorplan offers a comfortable and well-designed living environment.", "model_id": "llava-13b"}
+{"image": "11.jpg", "question_id": 23, "question": "Can you read the text in this flowchart and explain the process in detail?", "answer": "The flowchart shows a numerical sequence starting with the number 2 and ending with the number 7. The numbers are displayed in a step-by-step manner, with each number being fed into a small calculator. The process appears to be an experiment in numerical analysis, likely aimed at finding the value of a specific formula or demonstrating a mathematical concept. The calculator might be used to perform calculations or make adjustments to the input numbers, leading to the eventual output of number 7.", "model_id": "llava-13b"}
+{"image": "11.jpg", "question_id": 24, "question": "Which step in the flowchart is drawn with a diamond shape?", "answer": "There is a diamond-shaped box at the beginning of the flowchart, which represents the first step in the numerical sequence.", "model_id": "llava-13b"}
+{"image": "12.jpg", "question_id": 25, "question": "Can you read the text in this flowchart and explain it to me?", "answer": "The flowchart depicts a step-by-step process for preparing tea. It begins with the input Green tea, water poured and proceeds through various steps, including Wait for water to boil, Add tea bag, Tepan wait 3 min, and Serve. The process ends with the output Teapot ready, tea poured. The flowchart visualizes the sequence of actions to be taken in order to prepare a cup of tea, including the necessary waiting times and the order of adding ingredients.", "model_id": "llava-13b"}
+{"image": "12.jpg", "question_id": 26, "question": "Can you transform this flowchart into a simple program in python?", "answer": "Yes, here's a simple Python program that implements the flowchart's tea preparation process: import time def prepare_tea(): Green tea, water poured print(\"Green tea, water poured\") Wait for water to boil print(\"Wait for water to boil\") Add tea bag print(\"Add tea bag\") Tepan wait 3 min print(\"Tepan wait 3 min\") Serve print(\"Serve\") while True: Execute the steps in the tea preparation process prepare_tea() Wait for the next step in the process time.sleep(1) # sleep for 1 second keep_going = input(\"Press enter to continue or exit to stop\") if keep_going == \"exit\": break", "model_id": "llava-13b"}
+{"image": "13.jpg", "question_id": 27, "question": "How much is Wilkins earning?", "answer": "As an AI, I cannot determine the exact salary of individuals in the image. However, the image shows a list of names and job titles, including a person named Wilkins, who might be earning a salary of $52,000 in the advertising department. It is important to note that this is just an assumption based on the available information in the image, and the actual salary of the individual may differ.", "model_id": "llava-13b"}
+{"image": "13.jpg", "question_id": 28, "question": "Which department is the person who has the highest salary from?", "answer": "The person with the highest salary in the image works in the advertising department, earning $52,000.", "model_id": "llava-13b"}
+{"image": "13.jpg", "question_id": 29, "question": "This table is reporting monthly salary,can you calculate the yearly salary for each employee, and summarize it into a table and group them by department?", "answer": "I can provide you with the annual salary for each employee based on the monthly salary reported in the table. Here's a summary of the yearly salary for each employee in the table, grouped by department:\n\nSales Department:\na. John Doe: $60,000\nb. Jane Smith: $55,000\nMarketing Department:\na. Wilkins: $52,000\nb. Robert Davis: $56,000\nc. Elizabeth Perez: $53,000\nd. Matthew Greene: $54,000\nAdvertising Department:\na. Thomas Wilson: $62,000\nb. Emily Chen: $57,000\nc. Alexander Lee: $58,000\nd. Samantha Brown: $55,000\nThis table shows the annual salary for each employee in the three departments, which can provide a better understanding of the total compensation packages for the individuals within the company. Keep in mind that these are the annual salaries based on the monthly salary data provided in the original table.", "model_id": "llava-13b"}
+{"image": "13.jpg", "question_id": 30, "question": "Given the yearly bonus is 10%,12% and 15%for Administration,Finance,Marketing, respectively. Please provide me with the bonus for Emily, Max and Gloria.", "answer": "Using the provided annual salary data and assuming that the bonuses are distributed evenly across the year, the bonus for Emily in the Advertising department would be $57,000 x 12% = $6,540. The bonus for Max in the Sales department would be $60,000 x 12% = $6,600. The bonus for Gloria in the Marketing department would be $55,000 x 12% = $6,600. Please note that these calculations are based on the provided annual salary data and may not exactly match the actual bonuses awarded to the individuals in the image.", "model_id": "llava-13b"}
+{"image": "14.jpg", "question_id": 31, "question": "Answer question I.1.a. Think step-by-step.", "answer": "In the image, there are three figures depicting a refrigeration process. These figures show the steps involved in refrigerating fluid by using a refrigerant, a coil, and a condenser. I can see that the refrigerant flows through the coil and goes through a series of processes before reaching the fluid that needs to be cooled. I can help you understand the concepts behind these figures and provide more information on the refrigeration process if needed.", "model_id": "llava-13b"}
+{"image": "15.jpg", "question_id": 32, "question": "Can you write a template for this website in HTML? Replace the joke and punchline with a real one.", "answer": "web", "model_id": "llava-13b"}
+{"image": "16.jpg", "question_id": 33, "question": "How many championship rings did the player on the left win in his career?", "answer": "The player on the left, who is Kobe Bryant, has won five championship rings in his career.", "model_id": "llava-13b"}
+{"image": "17.jpg", "question_id": 34, "question": "Describe this image.", "answer": "The image features a cat sitting on a couch in a blue, wild-animal-themed onesie. The cat is holding a plate of cookies in its lap, appearing to enjoy a snack. Surrounding the cat, there are several other cakes placed on the couch, further emphasizing the cat's love for treats. The scene depicts a cozy and playful atmosphere, showcasing the cat's unique and fun costume while it enjoys a relaxing moment indoors.", "model_id": "llava-13b"}
+{"image": "17.jpg", "question_id": 35, "question": "Explain why this image is funny.", "answer": "This image is funny because it is not common to see a cat dressed up in a costume, like the blue wild animal onesie, while casually sitting on a couch and eating cookies. The scene combines a sense of humor and anthropomorphism, as it portrays the cat enjoying a human-like activity, such as watching TV or relaxing on a couch with a snack. The cat's costume adds a whimsical touch to the image, making it visually amusing and entertaining.", "model_id": "llava-13b"}
+{"image": "18.jpg", "question_id": 36, "question": "Describe this image as detailed as possible.", "answer": "The image displays a lively city street scene at night, bustling with pedestrian and scooter traffic. The street is illuminated by lights, creating a vibrant atmosphere. There are numerous people walking down the street, some holding umbrellas, and others wearing backpacks.\nA variety of scooters and motorcycles are parked and moving along the street, adding to the busy ambiance. A clock can be seen in the middle of the scene, providing a sense of the urban environment.\nIn the background, tall buildings line the street, contributing to the urban setting. The combination of pedestrian and scooter traffic, along with the bright lights and tall buildings, paints a picture of a bustling night in a city.", "model_id": "llava-13b"}
+{"image": "19.jpg", "question_id": 37, "question": "Describe this image as detailed as possible.", "answer": "The image depicts a large crowd of people at a concert, with many of them taking pictures of the performers. There are at least thirteen people visible in the scene, gathered closely together and capturing the moment with their cell phones. Two cell phones can be seen in the picture, one on the left side and another one on the right side of the crowd.\nThe crowd appears to be enjoying the show, and the atmosphere is lively as people are engaged in capturing the live performance. The concert setting is complete with a lit stage and a big screen, which may be displaying images or lyrics to enhance the audience experience.", "model_id": "llava-13b"}
+{"image": "20.jpg", "question_id": 38, "question": "Explain why this meme is funny.", "answer": "The meme is funny because it features a cute bulldog lying on a wooden floor with its head resting on its paws, seemingly exhausted or sleeping. The caption reads, \"Just Monday...\", which implies that the dog might be tired from the daily grind of being a pet, and it just wants to rest. The humorous element in this image lies in the anthropomorphism of the dog, as it reflects the common human experience of feeling tired or overwhelmed by the demands of daily life. The dog's expression and pose create a relatable moment, making the meme amusing to viewers.", "model_id": "llava-13b"}
+{"image": "21.jpg", "question_id": 39, "question": "Describe the image.", "answer": "The image features a wolf and a lamb, both lying down and appearing relaxed in a grassy meadow. The wolf is positioned on the right side of the scene, occupying the majority of the image height, while the lamb is located on the left, taking up a smaller portion of the image height. There are several flowers scattered throughout the meadow, adding a sense of vibrancy and life to the scene. Some flowers can be seen close to the wolf and lamb, while others are located in the foreground and the upper part of the image. The presence of the flowers and the animals coexisting in this peaceful setting creates an overall atmosphere of harmony and tranquility.", "model_id": "llava-13b"}
+{"image": "21.jpg", "question_id": 40, "question": "Can this happen in real world?", "answer": "While wolves are known to be wild animals and natural predators of sheep, the image depicts an unlikely scenario where they seem to coexist peacefully in a meadow. In reality, wolves would typically hunt sheep as part of their predatory behavior. However, this image might symbolize the idea of peaceful coexistence between different species or serve as an artistic representation of harmony in nature. It is essential to remember that in the wild, predator-prey relationships are generally exploitative, and the depicted scenario is not a common occurrence.", "model_id": "llava-13b"}
+{"image": "22.jpg", "question_id": 41, "question": "Describe the image.", "answer": " In the image, a number of cacti are covered in ice, which is quite unusual as cacti are typically found in warmer climates. The ice-covered cacti create a striking contrast in the scene, with the cold and desert-like environment surrounding them. The arrangement of these ice-covered cacti accentuates the juxtaposition of the typically hot and arid environment with the cold and frozen elements. This unique visual portrayal captures the attention of the viewer and invites them to explore the relationship between the cacti and the ice in this unusual setting.", "model_id": "llava-13b"}
+{"image": "22.jpg", "question_id": 42, "question": "Is this image common in real world?", "answer": "No, this image is not common in the real world. Cacti generally grow in arid or semi-arid regions, which are characterized by hot and dry climates. Ice-covered cacti are not typical, as the ice would melt in the warm environment, and the presence of ice on cacti is generally limited to cold storage or artificial ice formations. Thus, the image represents an unusual and striking visual contrast that captures the imagination, but it does not accurately reflect the natural coexistence of cacti and ice in the same environment.", "model_id": "llava-13b"}
+{"image": "23.jpg", "question_id": 43, "question": "Write brief HTML/JS to turn this mock-up into a colorful website, where the jokes are replaced by a real joke.", "answer": "web", "model_id": "llava-13b"}
+{"image": "24.jpg", "question_id": 44, "question": "What is the issue with my plant? What should I do now?", "answer": "Based on the image, it appears that your plant has some spots or marks on its leaves. These spots could be due to various factors, such as pests, disease, or environmental stress. To identify the exact cause of these spots, it would be helpful to provide more information about the plant's symptoms, its surrounding environment, and any measures that have been taken to care for it. In this way, I can assist you in diagnosing the issue and recommending appropriate treatment or care options to ensure the plant's health and well-being.", "model_id": "llava-13b"}
+{"image": "25.jpg", "question_id": 45, "question": "Why this happens and how to fix it?", "answer": "The image shows a washing machine with a big mess of bubbles, which suggests that there might be a problem with the machine or an issue with the detergent used. It is possible that the washer overflowed, causing a large amount of bubbles and foam to cover the floor.\nTo fix this situation, first, turn off the washer and disconnect the power supply. Then, carefully remove the excess water and foam from the room to avoid slipping hazards or damage to the flooring. Use a mop or a towel to clean the water and foam. Once the water and foam are cleaned, check for any remaining soap or detergent in the washer by opening the lid and inspecting the inside of the machine. If there is still a significant amount of soap or detergent, remove it carefully, and then rewash the affected clothing items or empty the washer's contents. If the problem persists, consult the user manual of the washer for troubleshooting steps or consider seeking professional help, like contacting a home appliance technician for further guidance.", "model_id": "llava-13b"}
+{"image": "26.jpg", "question_id": 46, "question": "Can you write me a beautiful poem about this image?", "answer": "In the image, a man and a dog are sitting together on a grassy hill, enjoying each other's company. The man is wearing a red coat, and the dog is a large Labrador. They seem to be watching the sunset or the sunrise, creating a serene and peaceful atmosphere. A beautiful poem for this scene could be: The sun casts a warm glow on the horizon, As the man and his Labrador friend sit together on the hill, Watching the day's last light fade away, Enjoying the tranquility of the moment, they stay. The man's red coat stands out against the natural backdrop of the grassy hill, As he gently strokes the dog's fur, their bond is evident, Forever friends, they share a moment of pure connection, In the midst of nature's beauty, their connection is evident. The dog's loyal and affectionate gaze, Meets the man's eyes filled with appreciation and love, As they sit together on the hillside, Their friendship's strength is apparent, like the sun's warm rays above.", "model_id": "llava-13b"}
+{"image": "27.jpg", "question_id": 47, "question": "Can you write me a master rap song that rhymes very well based on this image?", "answer": "Sure, here's a master rap song for you, inspired by the image of the elderly woman in shades and a green jacket:\nVerse 1:\nIt's an elderly woman in a green jacket and shades,\nDressing young and breaking the rules,\nShe's poppin' her neck and snapin' her fingers,\nShowing off her style with a \"no filter\" attitude.\nChorus:\nShe's a trendsetter, a fashion icon, a woman of wisdom,\nWith a fearless and bold attitude,\nShe's living her life with no regrets,\nEmbracing her individuality and owning it.\nVerse 2:\nShe's dressed in green, the color of freshness and growth,\nWith shades to shield her eyes from the sun,\nShe's a master of the game, a true OG,\nWith a style that's all her own and a spirit of fun.\nChorus:\nShe's a trendsetter, a fashion icon, a woman of wisdom,\nWith a fearless and bold attitude,\nShe's living her life with no regrets,\nEmbracing her individuality and owning it.\nRap done! Let me know if you have any changes or if you'd like me to add something different.", "model_id": "llava-13b"}
+{"image": "28.jpg", "question_id": 48, "question": "Can you write a story around this image for my child?", "answer": "Once upon a time, in a forest clearing, three friends - a bear, a bunny, and a cat - gathered around a dining table. They were enjoying a meal together, with plates of food and cups placed on the table. The animals sat on chairs, with the bear on the left side, the bunny in the middle, and the cat on the right side of the table.\nAs they ate, they engaged in conversation, sharing stories and experiences from their respective lives. The bear discussed his adventures in the forest, the bunny talk about hopping around the meadow, and the cat shared its fascinating encounters in the world of felines.\nIt was a magical moment, showcasing harmony and friendship between different species, as they dined and delighted in each other's company.", "model_id": "llava-13b"}
+{"image": "29.jpg", "question_id": 49, "question": "I designed a character for my game as shown in the image. Can you write me a background story for him?", "answer": "In the image, the character is a bald monk dressed in an orange robe, who is holding a large gray skull. It is possible that this character is a symbolic representation of the impermanence of life and the importance of contemplating death. The monk may be associated with a religious or spiritual background, where the skull could symbolize mortality and the impermanent nature of material possessions. The character might be a manifestation of mindfulness and introspection, encouraging others to reflect on their own lives and the fragility of existence. The presence of the skull could also suggest a connection to the teachings of Buddha, where the skull is considered a reminder of the transient nature of life and the path to enlightenment.", "model_id": "llava-13b"}
+{"image": "30.jpg", "question_id": 50, "question": "We design new mugs shown in the image. Can you help us write an advertisement?", "answer": "Introducing our new collection of whimsical mugs, featuring cat designs on one side and dogs on the other. These unique mugs are perfect for animal lovers and make a great addition to any home. Each mug is carefully crafted with attention to detail, showcasing the cute and charming characters that are sure to bring a smile to your face. They make an excellent gift, or treat yourself to a pair today! These imaginative mugs are not only functional but also serve as a fun decorative element for your kitchen or workplace. Plus, they're available in blue and white colors, adding a touch of elegance and versatility to any setting.", "model_id": "llava-13b"}
+{"image": "31.jpg", "question_id": 51, "question": "Help me write an advertisement for this.", "answer": "Sure! Here's a potential advertisement for the unique light fixture featuring a bird statue with an elephant-like trunk:\n\"Add a touch of whimsy to your home decor with this one-of-a-kind light fixture. The fixture features a bird statue, crafted with attention to detail, and it has a trunk-like extension that holds a light bulb. The design creates a playful and quirky ambiance, making it a perfect conversation piece for any room. The bird statue acts as both a light and a decorative element, adding character and charm to any space. Upgrade the look of your home with this creative and unusual light fixture – it's sure to be a talking point among your guests!\"\nThis advertisement highlights the unique features of the light fixture, emphasizing its artistic appeal and the playful nature of the design, while also showcasing how it can enhance a living space.", "model_id": "llava-13b"}
+{"image": "32.jpg", "question_id": 52, "question": "Who is the person in the image?", "answer": "In the image, there is a man dressed in a costume, likely representing a character from a cartoon or a movie, such as Pikachu from Pokemon. The man is holding a teddy bear, possibly implying a connection to the character's association with a popular animated movie or TV show. The overall scene is whimsical and entertaining, and the man might be participating in a themed event, a performance, or simply enjoying a lighthearted moment.", "model_id": "llava-13b"}
+{"image": "33.jpg", "question_id": 53, "question": "Describe this image.", "answer": "The image depicts a street scene with a person, likely a woman, running across the road in front of a car. The car is braking, trying to avoid hitting her. In the background, there are other cars and trucks on the street, some of which are parked or stopped in traffic.\nA stop sign is visible on the left side of the street, and a traffic light is captured in the scene as well. Overall, the image illustrates a potential traffic hazard situation where the pedestrian's sudden movement across the road has caused the car to brake, highlighting the importance of both pedestrian and driver attentiveness in such situations.", "model_id": "llava-13b"}
+{"image": "33.jpg", "question_id": 54, "question": "What do you think about the person's behavior?", "answer": "Based on the image, the person's behavior can be seen as irresponsible and dangerous. Running across the road in front of a moving car could endanger both the pedestrian's and the driver's safety. It is important for pedestrians to use designated crosswalks and follow traffic rules, while drivers need to remain vigilant and be prepared to react to sudden changes in the road conditions. In this situation, the person's actions have caused the car to brake, which could have led to a collision or other accidents if the driver wasn't able to stop in time. This highlights the significance of responsible and attentive behavior from both pedestrians and drivers to ensure road safety for everyone.", "model_id": "llava-13b"}
+{"image": "34.jpg", "question_id": 55, "question": "Describe about the image.", "answer": "The image features a young boy standing near a window, overlooking a cityscape from a high rise. The boy appears to be admiring the view, taking in the scenery from his vantage point. In the scene, there are also two cars visible, possibly passing by or parked at a lower level within the building. The photograph captures the sense of wonder and curiosity often associated with children, as they explore and make sense of the world around them.", "model_id": "llava-13b"}
+{"image": "34.jpg", "question_id": 56, "question": "Do you have anything to say to the young boy in the image?", "answer": "While I cannot communicate directly with the boy in the image, I would encourage him to continue exploring and learning about the world around him. The act of looking out the window and observing the cityscape represents a curiosity and a natural inclination to understand the environment, which is an essential part of human growth and development. Keep on reflecting, learning, and experiencing new things, young boy, as this will help you to grow into a well-rounded and knowledgeable individual.", "model_id": "llava-13b"}
+{"image": "35.jpg", "question_id": 57, "question": "Can you give me a short introduction to this movie?", "answer": "The image depicts two men in suits having a conversation in a room. One man is lighting a cigarette while the other man, who appears to be a smooth-talking criminal, is closely listening. They are both well-dressed, with one of the men wearing a tie. The exchange between them gives off a sense of intrigue and mystery, suggesting that they might be engaged in some sort of negotiation, criminal activity, or other secretive discussions. The scene is reminiscent of a scene from a James Bond movie or a classic noir film.", "model_id": "llava-13b"}
+{"image": "36.jpg", "question_id": 58, "question": "Can you give me a detailed introduction to this painting, and also the artist?", "answer": "The painting in question is \"The Persistence of Time\" created by Salvador Dali, a renowned Spanish surrealist painter. This famous artwork features a series of melting clocks draped across a barren landscape. The clocks appear to be defying gravity and time, creating a thought-provoking and captivating visual effect.\n\nDali was born in 1904 in Figueres, Spain, and he began his artistic career at an early age. He is best known for his unique style that blended fantasy, dreams, and symbolism. As a leading figure in the Surrealist movement, Dali's artwork defies conventional norms and features striking, often abstract, imagery that challenges the viewer's perception of reality.\n\nThis painting, in particular, showcases Dali's remarkable skill and innovative style, solidifying his position as a prominent and influential artist in the world of modern art.", "model_id": "llava-13b"}
+{"image": "37.jpg", "question_id": 59, "question": "How should I do to cook this?", "answer": "To cook the noodles in the bowl, follow these steps:\n\nGather all the ingredients needed, such as the noodles, eggs, sausage, and other vegetables or proteins, as well as the broth.\nEnsure that the noodles are cooked and drained, and keep them in a separate bowl for now.\nIn a large pot, bring the broth to a boil. You can add some additional seasoning or aromatics, such as garlic, onions, or herbs, to enhance the flavor of the soup.\nAdd the cooked noodles to the boiling broth, and mix them in carefully.\nThen, add the sliced eggs, sausage, and any other vegetables or proteins you wish to include. Stir well to ensure that all the ingredients are evenly distributed in the broth.\nAdjust the seasoning or add additional spices as needed to taste. You may also add some chopped greens or herbs for garnish, if desired.\nAllow the soup to simmer for a few minutes to let the flavors meld together and the ingredients be heated through.\nServe the noodle soup in bowls, and garnish each serving with some fresh herbs, if desired.\nRemember that the cooking time may vary depending on the ingredients and the desired consistency for the dish. Be sure to adjust the cooking time and temperature as needed for the best results.", "model_id": "llava-13b"}
+{"image": "38.jpg", "question_id": 60, "question": "Explain the advantages of this product.", "answer": "The image shows a grey luxury sedan parked in a parking lot with a skyline in the background. This car, an Audi, comes with several advantages that make it an appealing choice for many car enthusiasts. First, the Audi is known for its quality and reliability, which ensures that the car will perform well and last for a long time. Second, the sedan has a sleek and modern design, which not only looks beautiful but also contributes to improved aerodynamics, leading to better fuel efficiency and performance. Third, the car is equipped with advanced safety features, such as airbags and other protective technologies, which help to minimize the risk of injury in case of accidents. Finally, the Audi sedan offers a comfortable and luxurious driving experience, thanks to its well-appointed interior with premium materials, comfortable seats, and cutting-edge technology, such as the virtual cockpit that provides important information to the driver in a visually engaging way. Overall, the Audi sedan combines style, performance, and luxury, making it an attractive option for car buyers.", "model_id": "llava-13b"}
+{"image": "39.jpg", "question_id": 61, "question": "Tell me something about the history of this place.", "answer": "The Great Wall of China is a famous and historic monument that stretches over 13,000 miles across northern China. It was originally built as a series of fortifications to protect the Chinese Empire from invasions and to control the movement of people and goods. The wall has a rich history dating back to the 16th century BC and has gone through various constructions, maintenance, and restorations over the years.\n\nMade of bricks, stones, and other materials, the wall is designed with multiple layers, gates, and watchtowers. It is considered an engineering marvel for its time, showcasing the advanced architectural and construction techniques used during the periods it was built. The Great Wall of China has become a symbol of Chinese culture and a top tourist destination. It is also a UNESCO World Heritage site, highlighting its significant historical and architectural value.\n\nIn the image, a section of the Great Wall surrounded by lush green trees and mountains can be seen, giving a glimpse of the stunning natural beauty that this monumental structure blends with.", "model_id": "llava-13b"}
+{"image": "40.jpg", "question_id": 62, "question": "Is this photo unusual?", "answer": "Yes, the photo shows an unusual sight of a house that has been turned upside down, with the help of photoshop or some other editing software. It is not a real house, but rather an image of a house that's been digitally manipulated to create an illusion. This kind of photo can be used for artistic purposes, surrealistic representations, or even as a part of a visual effect in a film or advertisement.", "model_id": "llava-13b"}
+{"image": "40.jpg", "question_id": 63, "question": "How could someone get out of the house?", "answer": "ince the house is upside down in the photo, the entrance and exits would be on the \"ceiling\" of the house. To get out of the house, one would need to go through the entrance that's now on the ceiling. This might require some effort, as escaping from a digitally manipulated image might not be as straightforward as leaving a real house. The person would need to find a way out of the house in the photo, which could be challenging considering the unconventional situation.", "model_id": "llava-13b"}
+{"image": "41.jpg", "question_id": 64, "question": "What are shown in the photo?", "answer": "The photo shows a man and a chicken in a field. The man appears to be running from the chicken while making a silly or weird facial expression.", "model_id": "llava-13b"}
+{"image": "41.jpg", "question_id": 65, "question": "What does the man feel and why?", "answer": "The man seems to be feeling startled, scared, or surprised by the chicken's presence, which is why he is making a weird facial expression and running from the bird.", "model_id": "llava-13b"}
+{"image": "42.jpg", "question_id": 66, "question": "What are the ingredients I need to make this?", "answer": "To make this delicious looking pepperoni pizza displayed on the wooden table, you will need the following ingredients: a pizza dough, tomato sauce, cheese (preferably mozzarella), pepperoni slices, and various other toppings such as herbs, onions, mushrooms, or olives, depending on your preference. First, you would need to prepare the pizza dough by mixing flour, water, yeast, salt, and sugar, and then letting it rise. After the dough has risen, you can roll it out to the desired thickness and shape. Next, you would prepare the sauce by combining tomatoes, garlic, onions, and seasonings, and spread it evenly over the pizza dough. Once the sauce is on the dough, you can add the cheese and pepperoni slices, followed by any additional toppings you prefer. Finally, you would bake the pizza in an oven at the appropriate temperature, usually around 475°F (245°C) for a conventional oven, until the crust is golden brown and the cheese is bubbly and starting to brown. After baking, you can remove the pizza from the oven, let it cool slightly, and then slice and serve it.", "model_id": "llava-13b"}
+{"image": "42.jpg", "question_id": 67, "question": "What is the first step?", "answer": "The first step in making this pepperoni pizza is preparing the pizza dough. This involves mixing the ingredients like flour, water, yeast, salt, and sugar, and then letting the dough rise until it has doubled in size. After the dough has risen, you can roll it out to your desired thickness and shape.", "model_id": "llava-13b"}
+{"image": "43.jpg", "question_id": 68, "question": "Which city is this?", "answer": "This is Singapore, a city known for its beautiful waterfront and iconic landmarks.", "model_id": "llava-13b"}
+{"image": "43.jpg", "question_id": 69, "question": "Why do you think so?", "answer": "Singapore is often considered a prime example of a beautiful waterfront city due to its architectural and urban planning efforts to create a harmonious blend of modern and traditional elements. The city's skyline, combined with its well-manicured public spaces and attractions like the Merlion, a mythical creature that symbolizes the city, contribute to its unique and visually appealing character. Furthermore, Singapore is known for its efficient use of space and its green initiatives, which have resulted in a clean and environmentally-conscious urban landscape. All these factors contribute to the city's reputation as a beautiful and attractive destination.", "model_id": "llava-13b"}
+{"image": "44.jpg", "question_id": 70, "question": "What happened at the end of this movie?", "answer": "In the movie, the main characters, Jack and Rose, portrayed by Leonardo DiCaprio and Kate Winslet, tragically die at the end. The couple is seen at the top of the ship's railing, with their arms around each other, as they watch the sunset together. The movie depicts their love story and journey on the Titanic, which eventually leads to their unfortunate demise as the ship sinks.", "model_id": "llava-13b"}
+{"image": "44.jpg", "question_id": 71, "question": "Did Leonardo Dicaprio's character survive?", "answer": "No, Leonardo DiCaprio's character, Jack, did not survive. He sacrificed himself to help Rose (played by Kate Winslet) by allowing her to board a lifeboat, while he remained on the sinking Titanic.", "model_id": "llava-13b"}
+{"image": "45.jpg", "question_id": 72, "question": "What is in the photo?", "answer": "The photo features a homemade pizza that is designed to look like a cat. The cat is made out of various pizza ingredients, giving it a unique and creative appearance.", "model_id": "llava-13b"}
+{"image": "45.jpg", "question_id": 73, "question": "What is the nose made of?", "answer": "The nose of the cat-shaped pizza is made of green peppers.", "model_id": "llava-13b"}
+{"image": "46.jpg", "question_id": 74, "question": "Write a conversation between the two animals.", "answer": "Cat: Hi, Labrador! I noticed you're laying down in the snow, and I thought, \"Hey, why not take a ride?\" So, here I am, sitting on your back.\n\nLabrador: Hi, Cat! It's quite an unexpected sight, but I'm happy to have you on my back. What can I do for you, apart from being a snowy surface for you to play on?\n\nCat: Well, Labrador, I'm not sure if you can help me with this, but I've been having trouble recently with the other animals, like the mouse and the bird. They keep getting into my food and stealing my treats. Do you think you can assist me in keeping them away from my food?\n\nLabrador: I'll do my best to help, Cat. However, since I'm just an animal and not a human, my abilities to control other animals are limited. But, I can keep an eye on the situation and try to distract the other animals whenever they approach you.\n\nCat: Thanks, Labrador. I appreciate your effort. By the way, it's quite an adventure for me to be riding on your back in the snow. Keep your fur warm and dry, so both of us can stay comfortable during this unique experience.", "model_id": "llava-13b"}
+{"image": "47.jpg", "question_id": 75, "question": "Who is the man on the right?", "answer": "The man on the right is likely a teammate or fellow player of the man on the left, as they are standing next to each other and smiling. They could be posing for a photo or engaging in a conversation.", "model_id": "llava-13b"}
+{"image": "47.jpg", "question_id": 76, "question": "Who is the man on the left?", "answer": "The man on the left is likely a basketball player as well, given the context provided. However, without additional information or context, it is not possible to determine his specific identity or role.", "model_id": "llava-13b"}
+{"image": "47.jpg", "question_id": 77, "question": "Who is the taller man?", "answer": "The taller man is the one on the left, as described in the photo. The man on the right, who is likely Yao Ming or Du Feng, is standing next to him.", "model_id": "llava-13b"}
+{"image": "47.jpg", "question_id": 78, "question": "Who is wearing red?", "answer": "The man on the left, who is taller, is wearing red. The other man, who is likely Yao Ming or Du Feng, is standing next to him and is not wearing red.", "model_id": "llava-13b"}
+{"image": "47.jpg", "question_id": 79, "question": "Who is the man with the water bottle in his hand?", "answer": "The man with the water bottle in his hand is most likely the taller man, as he is wearing red and standing next to the other man. It is a common practice for athletes to hold water bottles during breaks or while playing sports to stay hydrated.", "model_id": "llava-13b"}
+{"image": "48.jpg", "question_id": 80, "question": "Introduce the movie character in this picture.", "answer": "In the picture, there is a group of four people dressed up as popular movie characters. The characters include a man dressed as Batman, a woman dressed as Wonder Woman, and two individuals dressed as Iron Man and Thor. They are standing next to each other, posing for the picture, and highlighting their costumes to create a dynamic and engaging image that captures the essence of the movie characters they are portraying.", "model_id": "llava-13b"}
+{"image": "49.jpg", "question_id": 81, "question": "Analyze the tactical characteristics and lineups of both sides and predict who will win.", "answer": "I'm sorry, but as an AI visual assistant, I am unable to predict the outcome of a sports game like soccer. I can provide information about the tactical characteristics and lineups of the two teams, but ultimately, the performance of the players and the outcome of the game depend on various factors such as their skill levels, teamwork, strategy, and external factors like weather and field conditions. Predicting the winner would require a crystal ball or knowledge beyond the scope of my current capabilities.", "model_id": "llava-13b"}
+{"image": "50.jpg", "question_id": 82, "question": "Tell me about this work of art.", "answer": "This image is a reproduction of a famous painting depicting a starry night. The artwork is characterized by the striking use of blue and yellow colors, creating a vibrant and captivating visual effect. The scene showcases a panoramic view of the night sky filled with swirling stars, with a large house nestled in the foreground. The painting is an impressive representation of the beauty and wonder of the night sky, and it captures the essence of the original artwork while maintaining its distinctive style.", "model_id": "llava-13b"}
diff --git a/models/mPLUG_Owl/OwlEval/answer/mPLUG_Owl_7b_answer.jsonl b/models/mPLUG_Owl/OwlEval/answer/mPLUG_Owl_7b_answer.jsonl
new file mode 100644
index 0000000..80036af
--- /dev/null
+++ b/models/mPLUG_Owl/OwlEval/answer/mPLUG_Owl_7b_answer.jsonl
@@ -0,0 +1,82 @@
+{"image": "1.jpg", "question_id": 1, "question": "What is funny about this image? Describe it panel by panel.", "answer": "This image is funny because it shows a person holding a cell phone and a USB cord, with a small battery and a USB charger plug. The person is using the USB charger to charge the phone using the battery, which is an unusual and comedic way to charge a cell phone. It is humorous because it shows the person's resourcefulness in finding ways to charge their phone without the usual charging cable or plug-in. Additionally, the person's hand holding the phone and the battery is positioned in a way that makes it appear as if they are trying to keep the battery and the phone together, adding to the comedic element. Overall, the image is funny due to the unusual charging method, the positioning of the hand holding the phone and the battery, and the small battery and USB charger being combined.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "2.jpg", "question_id": 2, "question": "Why would a person find this image funny?", "answer": "The image features a funny scene of an animated squirrel taking a picture while holding a digital camera. Squirrels are known for their mischievous behavior, so seeing one using a camera to capture their antics could amuse viewers. Additionally, the scene is presented with an artistic style that adds to the comical atmosphere, making it even funnier to view. The juxtaposition of the serious-looking squirrel and the lighthearted nature of the image creates a sense of humor and playfulness that appeals to the viewer, invoking laughter and enjoyment.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "3.jpg", "question_id": 3, "question": "What is unusual about this image", "answer": "What makes this scene unusual is the presence of a man ironing clothes on a busy New York City street while a large yellow taxi drives by in the background. Usually, people would not iron clothes in such an urban environment, as it is more common to do so in a more private or quieter location. The combination of a street scene and a man ironing his clothes in public can create a unique visual impression and draw attention to the man's unusual activity.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "4.jpg", "question_id": 4, "question": "Is there text in the image? If so, can you read it and explain it to me?", "answer": "Yes, there is text in the image. The text reads 'Before... I don't feel good about my looks, test the text, am I pregnant? Now... I don't feel good about my looks, take the test, do I have the con?' This message is a joke about a person's insecurities about their appearance and the stress they feel about taking a pregnancy test.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "4.jpg", "question_id": 5, "question": "What is funny about it?", "answer": "The joke is funny in several ways. First, it uses contrast to highlight the difference in the person's feelings about their appearance in the past and the present. The past self is unsure, while the present self seems more certain. This creates humor as the reader might relate to similar feelings of uncertainty about one's appearance.\nSecond, the message also makes fun of the stress people often feel when taking pregnancy tests. Many individuals might experience anxiety or worry when taking such tests, especially if they are not trying to get pregnant. The juxtaposition of these two elements, the person's appearance insecurities and the pregnancy-test anxiety, adds to the humor of the image and creates a relatable scenario for many individuals.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "5.jpg", "question_id": 6, "question": "Where is the frisbee in the image?", "answer": "The frisbee is in the air in the image.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "5.jpg", "question_id": 7, "question": "Where is the man in the image?", "answer": "The man is on the lawn playing Frisbee with his dog.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "5.jpg", "question_id": 8, "question": "What is the spatial relation between the frisbee and the man?", "answer": "The frisbee is high in the air and is being chased by the dog as it flies through the air.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "6.jpg", "question_id": 9, "question": "What should I do after kneading the dough, before dividing it into pieces?", "answer": "After kneading the dough until it is smooth and elastic, you should place it on a floured work surface. You can then flatten the dough using a rolling pin to form a rectangle. Next, using a knife, cut the rectangle into equal sized pieces. Finally, you should gently press each piece of dough into a flat disc, cover it with a damp cloth, and allow it to rest for a few minutes until it doubles in size. This will help it rise properly and develop a more even texture before baking.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "6.jpg", "question_id": 10, "question": "How long should I put in warm places?", "answer": "After dividing the dough into pieces, place them in warm places to allow them to rise, which usually takes about 2-3 hours. You can put them near a heated vent, oven, or warm water. It is essential that the dough stays warm, but not too hot, to encourage the yeast to work and produce gas that will cause it to rise. If the dough is too warm, it could become over-risen, which can lead to flat and tough bread.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "7.jpg", "question_id": 11, "question": "What is the total amount?", "answer": "The total amount of money in the image is $33.83.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "8.jpg", "question_id": 12, "question": "Excluding the regular cab, what is the average total fueling cost?", "answer": "The average total fueling cost for the vehicles in the image is $10,000. This is calculated by adding all the total fueling costs for each vehicle and then taking the average. The cost per gallon for each vehicle varies, with Ford F-150 costing $2.35 per gallon, Ford F-450 costing $2.44 per gallon, Ford Ranger costing $2.48 per gallon, and Ford C-Max Hybrid costing $2.58 per gallon. When these costs are added together to get the total, the average fueling cost for all four vehicles is $2.50 per gallon, which is $10,000 for 40,000 gallons.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "9.jpg", "question_id": 13, "question": "Excluding Ford F150 Lightning, what is the average total charging cost?", "answer": "The average total charging cost is estimated to be around $122 for the 5-year period. This is based on the assumption of charging an electric car for 100 miles per day, using the Tesla Model 3 as a benchmark for a premium electric vehicle, and assuming a total of 3,650 miles driven per year.\nIn this case, the Tesla Model 3 has a total charging cost of $2,490 over 5 years, which is the average total cost for the other electric vehicles as well. The total charging cost is higher for the premium electric vehicles, such as the Rivian R1T and Tesla's Model 3, which are more expensive to acquire and maintain than the average electric vehicles.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "9.jpg", "question_id": 14, "question": "Now how much cheaper is the average charging cost over the electric cars (including Ford F150 Lightning) than the average fueling cost over the gasoline cars (including Ford F150(Regular Cab))?Let's think step-by-step.", "answer": "Calculate the average total fueling cost of the gasoline cars:\nFord F150 Regular Cab: Average fueling cost is $1,240 per year\nFord F150 Regular Cab (EcoBoost): Average fueling cost is $1,324 per year\nFord F150 SuperCrew (4x2): Average fueling cost is $1,360 per year\nFord F150 SuperCrew (4x4): Average fueling cost is $1,396 per year\nFord F150 SuperCrew EcoBoost (4x2): Average fueling cost is $1,408 per year\nCalculate the average total charging cost of the electric cars:\nFord F150 Lightning: Average charging cost is $5,400\nRivian R1T: Average charging cost is $2,490\nTesla Model 3: Average charging cost is $2,490\nChevy Bolt: Average charging cost is $2,490\nNissan Leaf: Average charging cost is $1,484\nCalculate the difference between the two average total costs:\nGasoline cars: Average total fueling cost = $37,560 ($1,240+$1,324+$1,360+$1,396+$1,408)\nElectric vehicles: Average total charging cost = $47,340 ($5,400+$2,490+$2,490+$2,490+$1,484)\nThe average total fueling cost for the gasoline cars is $37,560, and the average total charging cost for the electric vehicles is $47,340. Therefore, the electric vehicles have a total saving of $9,780 over 5 years, or about $1,958 per year.\nThis means that, on average, the electric vehicles have a savings of about $1,958 per year when compared to the gasoline-powered vehicles. This is due to the reduced fuel cost and maintenance associated with electric cars.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "10.jpg", "question_id": 15, "question": "How many bedrooms are there in this floor plan?", "answer": "There are two bedrooms in this floor plan.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "10.jpg", "question_id": 16, "question": "What is the dimension of Master Bedroom", "answer": "12' x 10'", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "10.jpg", "question_id": 17, "question": "Is there a bathroom in master bedroom?", "answer": "Yes, there is a bathroom in master bedroom.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "10.jpg", "question_id": 18, "question": "What is the dimension of Second Bedroom?", "answer": "11' x 8'", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "10.jpg", "question_id": 19, "question": "Is the terrace bigger than the kitchen?", "answer": "Yes, the terrace is bigger than the kitchen.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "10.jpg", "question_id": 20, "question": "What are the appliances in the kitchen in this floorplan", "answer": "Stove,Refrigerator,Microwave,Dishwasher", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "10.jpg", "question_id": 21, "question": "What will you see when you enter through the entrance? and what is the dimension of it?", "answer": "When you enter through the entrance, you will first see the foyer, which leads into two main living areas. These living areas include the living room and the dining room. The entrance door is likely 12 feet high, and it provides a spacious entryway into the home.\nThe living room and dining room are both 15 feet in width. This means that the entrance door provides a significant amount of space for entry, making it a pleasant and comfortable experience. The dimensions of the living room and dining room can be found in the image.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "10.jpg", "question_id": 22, "question": "Can you summarize this floorplan based on the aspects I was interested in, in a few sentences?", "answer": "A spacious floor plan with two main living areas, a large living room and a dining room, both 15 feet wide.\nA master bedroom with a size of 12 feet by 10 feet and a bathroom.\nA terrace bigger than the kitchen, offering additional living space and a great view.\nA kitchen with basic appliances, such as a stove, refrigerator, microwave, and dishwasher.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "11.jpg", "question_id": 23, "question": "Can you read the text in this flowchart and explain the process in detail?", "answer": "In the image, there is a flowchart displaying various steps in the process of creating a new computer. The flowchart starts with the user selecting an operating system, followed by the selection of hardware components, including a keyboard. After the hardware is installed, the software is downloaded, and a system is assembled. The user then tests the system and checks the hardware compatibility, and if any issues arise, they can troubleshoot the problem. The process is concluded with the user enjoying the use of their newly created computer.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "11.jpg", "question_id": 24, "question": "Which step in the flowchart is drawn with a diamond shape?", "answer": "In the flowchart, the step involving downloading the software is drawn with a diamond shape.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "12.jpg", "question_id": 25, "question": "Can you read the text in this flowchart and explain it to me?", "answer": "Sure, the flowchart depicts the steps involved in making a kettle boil. The process begins when the person presses the 'On' switch, which turns on the water and begins heating the water in the kettle. Once water reaches the boiling point, it will start bubbling and the kettle will start to whistle as steam is released.\nNext, the person should wait for about 10 minutes before turning off the 'On' switch. The water should have fully heated and boiled by this time. Once the water is boiled, the person should pour some into a cup to test it, and if it's boiling hot, it should be ready to use.\nAfter testing the water, the person should pour the remaining water back into the kettle, add sugar, and turn on the 'On' switch again. This time, wait for the kettle to reach a rolling boil. When the water reaches this temperature, it will start to foam, which indicates that the sugar has dissolved.\nThe final step is to turn off the 'On' switch once it's done boiling to avoid overcooking the food. The flowchart serves as a helpful guide that helps people understand the steps involved in making a kettle boil, ensuring they have a perfect cup of tea or coffee.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "12.jpg", "question_id": 26, "question": "Can you transform this flowchart into a simple program in python?", "answer": "Yes, you can easily transform the flowchart into a simple program in Python. The program would include the same steps, with each step being executed in a separate function:\nfrom tkinter import *\nimport time\nimport datetime\ndef on():\n\t# Turn on the water\n\tkettle_on.config(state='on')\n\ttime_pass.start()\ndef off():\n\t# Turn off the water\n\tkettle_off.config(state='off')\n\ttime_pass.stop()\ndef wait_for_boil():\n\t# Check if the water is at boiling point\n\twhile not water_boiled.get():\n\t\ttime.sleep(5)\ndef pour_water():\n\t# Pour the boiling water into a cup to test it\n\twater_in_cup.config(state='on')\n\twhile water_in_cup.get():\n\t\ttime.sleep(5)\ndef pour_boiling_water_back():\n\t# Pour the boiling water into the kettle again\n\twater_back.config(state='on')\n\twhile water_back.get():\n\t\ttime.sleep(5)\ndef test_water():\n\t# Test the water by pouring it into a cup\n\ttime_pass.stop()\n\twater_in_cup.config(state='off')\n\twater_in_cup.get()\ndef add_sugar():\n\t# Add sugar to the boiling water\n\tsugar_to_water.config(state='on')\n\twhile sugar_to_water.get():\n\t\ttime.sleep(5)\ndef stop_boiling():\n\t# Turn off the kettle after it's done boiling\n\tkettle_off.config(state='off')\n\ttime_pass.stop()\ndef main():\n\t# Create a GUI using Tkinter\n\troot = Tk()\n\t# Create and configure the buttons\n\tkettle_on = Button(root, text='On', command=on)\n\tkettle_off = Button(root, text='Off', command=off)\n\twater_in_cup = Button(root, text='Pour', command=pour_water)\n\twater_back = Button(root, text='Pour Boiling Water', command=pour_boiling_water_back)\n\tsugar_to_water = Button(root, text='Add sugar', command=add_sugar)\n\twater_boiled = Button(root, text='Test Water', command=test_water)\n\t# Connect the functions to the buttons\n\tkettle_on.pack()\n\tkettle_off.pack()\n\twater_in_cup.pack()\n\twater_back.pack()\n\tsugar_to_water.pack()\n\twater_boiled.pack()\n\t# Start the time counter\n\ttime_pass = Button(root, text='Wait for boil', command=wait_for_boil, state='disabled')\ttime_pass.pack()\n\t# Start the GUI\troot.mainloop()\nThis program uses the same logic as the flowchart, with each step being executed in a separate function and connected to the corresponding button. The 'On'/'Off'/'Test Water' buttons are connected to the respective functions in the same way as in the flowchart, while the 'Pour Water'/'Pour Boiling Water'/'Sugar to Water' buttons control the flow of the water.\nOnce the 'On' button is pressed, the 'Wait for Boil' button turns into a disabled state and the 'On' button starts heating the water, while the 'Wait for Boil' button starts timing the water's boiling time. The 'On' and 'Off' buttons are connected to the 'On' and 'Off' functions respectively, while the 'Wait for Boil' button is connected to the 'Wait for Boil' function.\nOnce the 'On' button is released and the water has reached the rolling boil, the 'Pour Water' and 'Pour Boiling Water' buttons control the flow of water into the cup and the kettle respectively. The 'Test Water' button is connected to the 'Test Water' function, which stops the time counter and checks the water's temperature.\nFinally, once all the steps and functions are completed, the 'Stop Boiling' button turns off the kettle and the 'Stop' button turns off the time counter, ending the program.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "13.jpg", "question_id": 27, "question": "How much is Wilkins earning?", "answer": "Wilkins is earning $25,500 per year. This information can be found in the table, where it is highlighted in blue and placed next to the corresponding department name (Administration, Marketing, Finance, and Education).", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "13.jpg", "question_id": 28, "question": "Which department is the person who has the highest salary from?", "answer": "The person who has the highest salary from the table is Marketing. This can be determined by looking at the blue highlighted row with the highest dollar amount, which indicates the highest earner in the group.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "13.jpg", "question_id": 29, "question": "This table is reporting monthly salary,can you calculate the yearly salary for each employee, and summarize it into a table and group them by department?", "answer": "Sure! Here is how you can convert the monthly salary into a yearly salary for each employee and group them by department in a table:\n| Employee Name | Monthly Salary (USD) | Annual Salary (USD) | Department |\n| Wilkins | $2550 | $31,200 | Administration |\n| Marketing | $2750 | $34,200 | Finance |\n| Admin 1 | $2850 | $35,200 | Marketing |\n| Finance | $3000 | $36,000 | Administration |\n| Admin 2 | $3100 | $37,200 | Marketing |\n| Education | $1500 | $18,000 | Marketing |\nIn the final table, you can group the employees by department, with each department having its own row and each employee within that department having its own row. The first column of the table shows the employee name, the second column shows the monthly salary (in USD), and the third column shows the annual salary (in USD).\nNote that the yearly salary is calculated by multiplying the monthly salary by 12, and adding up all of the monthly salaries for the employees within the department. This calculation is done in order to obtain a total annual salary for each department.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "13.jpg", "question_id": 30, "question": "Given the yearly bonus is 10%,12% and 15%for Administration,Finance,Marketing, respectively. Please provide me with the bonus for Emily, Max and Gloria.", "answer": "| Employee Name | Annual Salary (Before Bonus) | Bonus (%) | Annual Salary (With Bonus) |\n| Emily | $35,200 | 10% | $37,620 |\n| Max | $36,000 | 12% | $39,240 |\n| Gloria | $31,200 | 15% | $35,030 |", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "14.jpg", "question_id": 31, "question": "Answer question I.1.a. Think step-by-step.", "answer": "To calculate the total heat loss of the heater, we first need to determine its thermal conductivity. This can be done by dividing the thermal conductivity of the material of the heater by its thickness.\nThe thermal conductivity of the metal of the heater is usually given in units of W/m-K. To calculate the thermal conductivity of the heater using this unit, we need to divide the value by the thickness in millimeters. The metal of the heater is typically made of steel, and its thickness is 5 mm. Therefore, its thermal conductivity is calculated to be 150 W/m-K.\nNext, we need to find the heat flow from the heater to the environment, which is represented by the RTD. We know that the RTD has a resistance of R = 100 Ohms and a thermal resistance of Rt = 0.01 Ohm-C.\nTo calculate the total heat loss of the heater, we multiply the power input of the heater, which is given by P = 1 kW, by the RTD resistance and the thermal resistance. The result is 10.0 W of heat loss from the heater to the environment through Rt and the RTD resistance.\nFinally, we can use the formula for heat loss through thermal conductivity to calculate the total heat loss of the heater:\nTotal heat loss of the heater = 0.02 W/m-K * 5 mm^2 * 1 kW = 10 W.\nTherefore, the total heat loss of the heater is 10 W.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "15.jpg", "question_id": 32, "question": "Can you write a template for this website in HTML? Replace the joke and punchline with a real one.", "answer": "web", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "16.jpg", "question_id": 33, "question": "How many championship rings did the player on the left win in his career?", "answer": "The player on the left in the image won 5 NBA Championship rings in his career.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "17.jpg", "question_id": 34, "question": "Describe this image.", "answer": "The image showcases a cat sitting on a chair and wearing a blue sweatshirt, surrounded by a pile of cookies. The cat appears to be enjoying the cookies as it has both its front paws on the pile. The chair is placed near a table, and there are multiple cups scattered across the table, likely used in serving the cookies.\nThe cat's presence in an unusual setting adds to the comedic aspect of the image. The cat appears to be dressed up, which makes it look even more curious about the situation and its surroundings. The combination of cookies, a comfy chair, and various cups creates a cozy and relaxed atmosphere in the room, making the scene seem like a fun and enjoyable way to spend a Saturday night.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "17.jpg", "question_id": 35, "question": "Explain why this image is funny.", "answer": "This image is funny because it portrays a domesticated animal in an unusual, unexpected, and comedic setting. In the scene, an adorable cat is sitting on a blue sweatshirt, surrounded by a massive pile of cookies and a few cups. The cat is dressed up in a sweatshirt, which makes its situation even more unusual. This type of scenario is often associated with human behavior, such as when a child wears their Halloween costume for an unexpected event. The cat's presence in this setting makes for a humorous, entertaining, and light-hearted scene that evokes laughter and amusement from viewers.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "18.jpg", "question_id": 36, "question": "Describe this image as detailed as possible.", "answer": "The image captures a busy city street filled with people, cars, and motorcycles. The street is packed with a variety of vehicles moving across the scene. Some cars are closer to the center of the image, while others can be seen farther in the distance.\nIn addition to cars, there are numerous motorcycles present in the scene. Motorcycles are scattered across the street, some closer to the center, while others are located further back. The motorcycles are parked or moving, indicating that they are part of the city's traffic.\nA group of people can also be seen walking along the street, adding to the overall bustling atmosphere. Some people are closer to the center of the image, while others are further away, suggesting they might be engaging in different activities. The presence of both vehicles and pedestrians creates a vibrant and dynamic atmosphere in the city.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "19.jpg", "question_id": 37, "question": "Describe this image as detailed as possible.", "answer": "The scene features a concert hall with a large crowd of people gathered around a stage. The audience fills the room, creating a lively atmosphere. There are multiple musicians on stage, playing instruments and entertaining the crowd. The musicians are situated across the stage, with some closer to the left side and others on the right. A drummer stands at the center, surrounded by a group of guitars and basses.\nIn addition to the musicians, there are several microphones placed throughout the stage. The musicians are actively performing, engaging with the audience and captivating them with their music. A few people are also seen in the crowd, enjoying the live show and taking part in the lively atmosphere.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "20.jpg", "question_id": 38, "question": "Explain why this meme is funny.", "answer": "The humor in this meme comes from its playful use of words and the visual depiction of a cute, tired French bulldog lying down on its side, possibly after a long day. The phrase 'just... Monday,' is commonly used to express a feeling of weariness or frustration with the start of a new week, and the dog's expression of exhaustion perfectly captures this mood. The image is visually appealing and relatable, making it a funny and entertaining way to express a common sentiment.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "21.jpg", "question_id": 39, "question": "Describe the image.", "answer": "The scene depicts a wolf and a lamb resting together in a meadow. They appear to be in close proximity, with the wolf lying on its side next to the lamb, who is resting on the ground. A small bird can be seen flying nearby, adding a natural touch to the tranquil setting.\nThe wolf and lamb appear to have a peaceful relationship, with no sign of conflict or danger in this moment. The wolf seems to be content, resting and enjoying the presence of the lamb nearby, while the lamb seems calm and secure, lying peacefully in its natural habitat.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "21.jpg", "question_id": 40, "question": "Can this happen in real world?", "answer": "While it is possible for a wolf and an animal like a lamb to coexist in the wild, the exact scenario shown in the image cannot always be expected to happen in real life. Wolf-lamb interactions are usually limited to instances when the lambs are young and the wolves are not hungry.\nIn the wild, wolves typically prey on large ungulates, such as deer, moose, or bison, which can provide a sufficient amount of meat for an adult wolf. However, young lambs or sheep are more vulnerable, and they cannot defend themselves against a predator as well as an adult animal. As a result, wolves will typically avoid preying on these smaller, weaker animals if they are able to find alternative food sources. In the image, the wolf and lamb are resting together in a meadow, which may imply that they are in a safe area or that the lamb is too small to be seen as a viable prey option for the wolf.\nIn captivity or in areas where humans have altered the natural habitat, it may be possible for wolf-lamb interactions to occur. For example, in some zoos or animal sanctuaries, lambs are raised and kept in the presence of adult wolves, as part of a conservation initiative to reintroduce wolves into their native habitats. In these cases, the lambs may be protected by a fence or enclosure and the wolves are provided with food, ensuring that the wolf does not have to hunt the lamb for its survival.\nOverall, while it is possible for a wolf and an animal like a lamb to coexist in some situations, it is not always the case, and the exact circumstances shown in the image are likely unrealistic", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "22.jpg", "question_id": 41, "question": "Describe the image.", "answer": "The image features an unusual and interesting scene of a cactus growing in an environment that is covered with ice. The cactus has grown through the ice, creating a unique and captivating sight. The cactus can be seen standing tall and reaching towards the sky, with its branches spreading out in all directions, as if trying to grasp the ice around it.\nIn total, there are three cactuses visible in the image, each with its own unique appearance and position in the scene. The presence of the three cactuses suggests that they have adapted well to their unique surroundings, as they are able to grow successfully in such an unexpected environment.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "22.jpg", "question_id": 42, "question": "Is this image common in real world?", "answer": "The scene of a cactus growing in an icy environment can be considered uncommon and unique, as it does not typically occur in the wild in such a way. However, there have been some cases where cacti have adapted to cold temperatures and grown under unusual conditions, including the growth of cactus flowers in Antarctica and the Arctic Circle.\nIn the case depicted in the image, it is more likely that the cactus is growing in a controlled or artificial environment, such as a greenhouse, a controlled-climate laboratory, or a botanical garden. These environments can provide the necessary conditions to support the growth of cacti in environments that are otherwise not suitable, like the ice-covered lake or pond seen in the image.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "23.jpg", "question_id": 43, "question": "Write brief HTML/JS to turn this mock-up into a colorful website, where the jokes are replaced by a real joke.", "answer": "web", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "24.jpg", "question_id": 44, "question": "What is the issue with my plant? What should I do now?", "answer": "The issue with your plant is that some of the leaves have yellow spots and appear to have spots that are turning brown and dying. This is a symptom of a plant disease called leaf blotch, which is caused by fungal infections. The dying leaves should be removed immediately to minimize the spread of the disease. Additionally, it is crucial to maintain good hygiene practices around the plants and the garden, including proper spacing, proper watering techniques, and avoiding overcrowding. This will help prevent the infection from spreading to the other plants.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "25.jpg", "question_id": 45, "question": "Why this happens and how to fix it?", "answer": "This happens when the washing machine is overloaded with too much laundry, causing the water to overflow and create a large bubble of suds and water. The image shows the washing machine partially submerged in water, which suggests that the machine might have been left running for a prolonged period or that the user did not properly adjust the wash settings.\nTo prevent this problem, it is vital to follow the washing machine manufacturer's instructions and ensure that the machine is loaded with appropriate amounts of laundry, based on the size of the load. The user should also adjust the wash setting according to the type of laundry (delicate, normal, or heavy) to ensure proper cleaning and prevent the washing machine from overflowing.\nAdditionally, it is crucial to unplug the washing machine after use or when it is not in use to avoid accidents and potential damage to the appliance.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "26.jpg", "question_id": 46, "question": "Can you write me a beautiful poem about this image?", "answer": "As the day draws to a close, the young man and his golden retriever share a moment of companionship. The sun is beginning to set, casting its glow on the cityscape, creating a picturesque scene. They are sitting atop a hill, surrounded by the beauty of nature.\nThe man holds his pet close to his heart, sharing a bond of love and affection. Their connection transcends words, and their shared experience creates a sense of peace. As the man leans down to his beloved dog, he feels the warmth of its fur and the unconditional love it gives in return.\nThis special moment between the man and his dog captures the essence of companionship, trust, and the joy of spending time together. As the sun sets, their bond grows stronger, and they become part of the scenery, embracing the serenity of the evening.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "27.jpg", "question_id": 47, "question": "Can you write me a master rap song that rhymes very well based on this image?", "answer": "Sure, a master rap song that rhymes very well based on this image could go something like this:\nI got my sunglasses on and my green jacket,\nI'm ready to rock the dance floor with my old school moves.\nI've got my walker in hand and my hearing aid in place,\nBecause I'm the grandmother of the party and I'm ready to rap.\nAs I'm shaking it up, I'll show you my skills,\nMy hip-hop dance moves are so smooth and slick.\nThe old ladies are impressed with my moves on the floor,\nAnd my grandson can't believe his granny's doing this for more.\nI'm showing the young'uns how it's done,\nI'm the coolest grandma around,\nAnd they're learning the moves from me,\nI've got the best dance moves of anyone in the room.\nSo, don't be surprised if you see me shaking it up on the floor,\nMy hearing aid and sunglasses are my signature accessories,\nAnd I'm here to show you that age isn't a barrier to having fun,\nBecause I'm the grandmother of the party and I'm here to bust a move.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "28.jpg", "question_id": 48, "question": "Can you write a story around this image for my child?", "answer": "In the forest, a curious bear, a cuddly rabbit, and a playful fox are enjoying a meal together. The three friends gather around a small table, sharing their food and having a good conversation. They are having a fun and peaceful time together, sharing stories and bonding over their meal.\nThe bear seems to be the most dominant among the three friends, as he holds a large piece of food in his paw. He is curious and inquisitive about their surroundings as he looks around the forest. The rabbit, on the other hand, appears to be more relaxed. He sits close to the bear and enjoys the company of their newfound friends.\nThe fox, on the other hand, is playfully sitting on the table. He has a mischievous look on his face as he observes the other animals' interactions. The friends have created a harmonious and peaceful atmosphere, which is reflected through their laughter, smiles, and the joyful sounds they make while they eat together. The image shows the importance of friendship, sharing, and enjoying simple moments in life.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "29.jpg", "question_id": 49, "question": "I designed a character for my game as shown in the image. Can you write me a background story for him?", "answer": "The ancient man in the image is a monk with a grayish skin tone, wearing a traditional monk outfit and carrying a large brain in one hand. He stands tall and proud, with a confident demeanor. The monk could be from a far-off land or a distant era, where he practices ancient knowledge and wisdom. He may have devoted his life to studying and understanding the mysterious properties of the brain and its role in the human mind. His knowledge and understanding of the brain could have come from extensive meditation and contemplation, or perhaps from studying ancient texts and artifacts. His quest for knowledge could stem from a desire to unlock the secrets of consciousness and the human psyche, or simply from a curiosity about the human brain and its capabilities.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "30.jpg", "question_id": 50, "question": "We design new mugs shown in the image. Can you help us write an advertisement?", "answer": "Sure, here is an advertisement for the new mugs that we have designed featuring a cute cat and a cat sitting next to each other:\n'Enjoy a cup of coffee or tea from our new set of mugs featuring a pair of adorable cats! These mugs are crafted from high-quality ceramics with a unique and creative design that will surely bring a smile to your face. Each mug holds 12 ounces of your favorite drink, making it the perfect companion for your daily routine. So, don't miss out on this opportunity to add a touch of cuteness to your tabletop while enjoying your favorite beverage. Order today!'", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "31.jpg", "question_id": 51, "question": "Help me write an advertisement for this.", "answer": "Advertisement for the unique and eye-catching Cockatoo Table Lamp, a lamp designed to resemble a realistic and vibrant cockatoo. This lamp features a realistic body, head, and tail made of durable metal with an authentic-looking feather detail, adding a touch of whimsy and character to any space. The lamp is also equipped with an energy-efficient LED light that provides bright and even illumination without the heat and glare of traditional light bulbs. This functional yet visually striking lamp is a perfect addition to any living space, office, bedroom, or even outdoor area. It can be used as an accent lamp, a nightlight, or as a functional desk lamp for work, study, or play.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "32.jpg", "question_id": 52, "question": "Who is the person in the image?", "answer": "The person in the image is Elon Musk, the founder and chief executive officer of several companies, including Tesla, SpaceX, and The Boring Company.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "33.jpg", "question_id": 53, "question": "Describe this image.", "answer": "The image shows a pedestrian in the middle of a busy crosswalk, attempting to cross the street while cars and a bicyclist are approaching. The cars and the bicyclist are traveling towards the left side of the image.\nIn the background, there is a traffic light, which is red in color, indicating that no cars can pass through the intersection. The pedestrian has to wait for it to turn green before proceeding across the street.\nIn addition, there are multiple cars and a bicyclist visible throughout the scene – some of them closer to the foreground of the image than others – indicating that this busy intersection is a common spot for traffic.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "33.jpg", "question_id": 54, "question": "What do you think about the person's behavior?", "answer": "Based on the image, it appears that the person is trying to make a quick and risky decision, crossing the street in between the oncoming cars and bicycle when the light is red. This could potentially put them in an unsafe situation, as they might be at risk of being hit by one of the vehicles if they are not careful.\nWhile I cannot definitively determine the person's reason for doing this, it is advisable for pedestrians to wait for the signal to change to green before entering the crosswalk, as it is safer for themselves and the other drivers and cyclists.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "34.jpg", "question_id": 55, "question": "Describe about the image.", "answer": "The image depicts a young boy standing in front of a large city window, looking through it and into the distance, possibly at the city skyline. He appears to be a child with a backpack on, standing in a room that might serve as a play area. The city is visible beyond the large window, with tall, colorful buildings in the background.\nThe scene captures a moment of awe for the young boy, as he gazes at the expansive view of the city and imagines what it might be like to explore the vast metropolis. He is engrossed in the cityscape, possibly dreaming or daydreaming about the adventures he could have in such a dynamic urban setting.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "34.jpg", "question_id": 56, "question": "Do you have anything to say to the young boy in the image?", "answer": "Dear young boy,\nI admire your curiosity and imagination. The view from the city window you're standing near is certainly captivating. I hope you're inspired to explore the city, its people, culture, and history.\nAs you explore this bustling metropolis, remember to be respectful, polite, and considerate to others. Enjoy the sights, sounds, and smells of the city, and appreciate the diversity of its inhabitants.\nMay your journey be a meaningful and memorable experience.\nBest wishes to you,\nYour AI friend.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "35.jpg", "question_id": 57, "question": "Can you give me a short introduction to this movie?", "answer": "The movie is titled 'The Godfather: Part II,' and it is a sequel to the iconic 'The Godfather' film. It follows the story of the Corleone family and the power struggle between the two main antagonists, Don Vito Corleone (played by Marlon Brando) and his youngest son, Michael Corleone (played by Al Pacino). The film takes place in the 1950s and follows Michael as he struggles with balancing his family's business interests with his own moral compass and his relationship with his wife, Kay (played by Diane Keaton). The film features numerous famous actors and actresses in iconic roles, such as Robert De Niro, Robert Duvall, James Caan, and Talia Shire. 'The Godfather: Part II' was released in 1974 and was a box office success, earning over $151 million worldwide. It has since gained recognition as one of the best sequels in history and received critical acclaim, winning numerous awards, including three Oscars for Best Picture, Best Writing, and Best Adapted Screenplay.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "36.jpg", "question_id": 58, "question": "Can you give me a detailed introduction to this painting, and also the artist?", "answer": "The painting depicts a scene with several clocks placed in a surreal environment. In the center of the image, there is an oversized clock that dominates the composition. The clocks are scattered throughout the scene, with one clock placed on a shelf, another clock in the foreground, a third clock on the right side, and a fourth clock towards the bottom.\nThe clocks vary in size and shape, adding a sense of depth and complexity to the overall composition. There is a man sitting near one of the clocks, and another person is seen on the right side of the painting. The scene also features various objects placed on the ground, such as a book, a cup, and a bowl. Overall, the painting presents a visually intriguing and thought-provoking scene that blends elements of reality and fantasy.\nThe artist of the painting is Salvador Dalí, a Spanish painter best known for his surrealist artwork. His unique style and techniques have made his artworks iconic, and this piece is representative of his surrealist approach to art.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "37.jpg", "question_id": 59, "question": "How should I do to cook this?", "answer": "To cook the ramen dish, first, boil some water in a large pot. Then, add the noodles and cook them according to the instructions on the package. Once they are done, drain and rinse them with cold water to remove excess salt and oil.\nNext, prepare the broth by combining the chicken broth, miso paste, and soy sauce in another large pot. Bring the mixture to a simmer over medium-high heat, stirring occasionally. When the broth has reduced by half, add the egg, green onions, and pork belly. Simmer the broth until the pork belly is cooked through, which should take about 15 minutes.\nWhile the pork belly is cooking, prepare the eggs by cracking them into a bowl and adding the salt and vinegar. Mix thoroughly to combine, then set aside.\nTo assemble the ramen dish, divide the cooked noodles among four bowls and top each with a generous amount of broth. Add some chopped green onions, a couple of slices of pork belly, and a couple of eggs, broken up into small pieces. Finally, sprinkle with some freshly ground black pepper, garnish with scallions or chives, and serve warm. Enjoy your delicious ramen dish!", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "38.jpg", "question_id": 60, "question": "Explain the advantages of this product.", "answer": "The product, a black car parked in front of a cityscape, is an Audi R8, a sports car that is known for its powerful performance, sleek design, and high-end features. The car has an aerodynamic body, which enhances its driving capabilities, stability, and speed. Additionally, the R8 has an all-wheel drive system for superior traction and control, making it suitable for various types of roads and conditions.\nThe car's design, which includes a low and wide stance, a long and sleek body, and aggressive-looking features, adds to its appeal as a high-performance vehicle. Its stylish and sophisticated appearance also makes it a desirable choice for car enthusiasts. Moreover, the Audi R8 is equipped with advanced technologies, such as a heads up display, navigation system, parking assistance system, and a touchscreen infotainment system, offering a luxurious and comfortable driving experience.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "39.jpg", "question_id": 61, "question": "Tell me something about the history of this place.", "answer": "This image features the Great Wall of China, which is one of the most iconic structures in the country's history. It spans across the mountains in the background, and it was built as a defensive structure to protect the Chinese Empire against invaders. The construction of the Great Wall began in the 5th century BCE, and over the centuries the wall was rebuilt and enhanced with different materials and technologies. Today, the Great Wall is considered a UNESCO World Heritage Site, attracting millions of tourists each year.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "40.jpg", "question_id": 62, "question": "Is this photo unusual?", "answer": "Yes, it is unusual to see a large building with a curved roof that resembles a large wooden boat or ship. The building appears as if it has a large hole in the middle, and the roof has a distinct shape. The unusual design of the building makes it stand out from others and creates a distinctive visual effect.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "40.jpg", "question_id": 63, "question": "How could someone get out of the house? ", "answer": "To exit the building, the occupants would likely need to use a ladder or other means of accessing the hole in the roof. The building's unusual shape and design may present challenges for people trying to move in and out of the structure comfortably. However, it is likely that the building has been designed to accommodate such features, and occupants would be able to navigate the space safely and effectively.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "41.jpg", "question_id": 64, "question": "What are shown in the photo?", "answer": "The image shows a man standing in the middle of a field surrounded by numerous chicken coops. He is holding a rooster by its neck, attempting to control or subdue it.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "41.jpg", "question_id": 65, "question": "What does the man feel and why? ", "answer": "The man may feel a mixture of emotions, including excitement, fear, and possibly anger, as he attempts to control the large rooster. His actions suggest that he is attempting to catch or control the bird, which may be agitated or frightened by his actions. This could be for various reasons, such as protecting his property or attempting to catch the bird to eat. It is important for the man to proceed cautiously and avoid causing harm to the chicken, while also ensuring his own safety and the safety of other animals and people in the vicinity.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "42.jpg", "question_id": 66, "question": "What are the ingredients I need to make this?", "answer": "To make the delicious pizza shown in the image, you need the following ingredients: flour, yeast, water, sugar, oil, salt, pepper, mozzarella cheese, and pepperoni. You also need a baking tray with parchment paper to spread the dough on.\nThe process of making this pizza starts with kneading the dough using flour, water, yeast, and salt. Then, the dough is allowed to rise, which is typically done for an hour or more. Next, you spread the toppings on the risen dough. After that, the dough is rolled out and placed on the parchment paper-lined baking sheet. It is then baked in a preheated oven until the crust is golden brown and the cheese has melted.\nFinally, you remove the pizza from the oven, let it cool down, and serve it with your favorite toppings or side dishes. The image shows a slice of this pizza, so you can see how delicious and satisfying it would be.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "42.jpg", "question_id": 67, "question": "What is the first step?", "answer": "The first step to making this pizza is to mix together the flour, yeast, water, sugar, oil, salt, pepper, and mozzarella cheese in a large mixing bowl. Knead the dough until it comes together and is smooth and elastic, adding more water or flour as needed. The dough should be slightly sticky but not too wet or dry. Once the dough is well-kneaded, let it rest for an hour or more until the dough rises.\nAfter the dough has risen, you will need to roll it out and spread the toppings on it. You can spread the toppings as thick or thin as you prefer. Then, the dough is rolled out and placed on the parchment paper-lined baking sheet. The pizza is baked in a preheated oven until the crust is golden brown and the cheese has melted.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "43.jpg", "question_id": 68, "question": "Which city is this?", "answer": "The city in the image is Singapore, a vibrant and modern metropolis in southeast Asia known for its skyscrapers, diverse ethnicity, and rich heritage.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "43.jpg", "question_id": 69, "question": "Why do you think so?", "answer": "There are several factors that lead me to believe that this city is Singapore. Firstly, the image features the iconic Merlion statue, which is a well-known landmark of the city, representing the mythological creature half lion, half fish. The presence of the statue at the waterfront, with a beautiful sunrise and a sailboat in the background, is a strong indicator of Singapore.\nAdditionally, the image also features several skyscrapers and a cityscape that aligns with typical features of Singapore's skyline. The city's distinctive architecture, such as Marina Bay Sands hotel, the Singapore Flyer, and the Singapore ArtScience Museum, are prominent in the scene, further confirming its identity as Singapore.\nFurthermore, the image depicts a busy waterfront area with many people enjoying themselves, which is a common sight in many cities in Singapore. The presence of the sailboat and the large crowd suggests that this is an important tourist or recreational area in Singapore, where people gather to experience the city's unique culture, history, and natural beauty.\nIn conclusion, the image of a sunrise over Singapore's iconic Merlion statue, with the city's distinctive buildings and skyscrapers in the background, is a compelling representation of Singapore's rich culture and vibrant lifestyle.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "44.jpg", "question_id": 70, "question": "What happened at the end of this movie?", "answer": "At the end of the movie, the ship, the Titanic, sank into the water after striking an iceberg. The ship's passengers and crew were forced to evacuate the ship, and many people perished in the water or were rescued by lifeboats from various ships. The story follows the journey of Rose (played by Kate Winslet), Jack Dawson (played by Leonardo DiCaprio), and other passengers as they try to survive the sinking of the Titanic. The film portrays the tragedy of the event and the human struggle to stay afloat in the icy waters of the ocean.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "44.jpg", "question_id": 71, "question": "Did Leonardo Dicaprio's character survive?", "answer": "The nose on the pizza plate is made of pepperoni slices.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "45.jpg", "question_id": 72, "question": "What is in the photo?", "answer": "The image features a pizza shaped like a cat, which has various toppings such as green pepper and cheese on the surface. The cat's face is made of cheese.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "45.jpg", "question_id": 73, "question": "What is the nose made of?", "answer": "The nose on the pizza plate is made of pepperoni slices.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "46.jpg", "question_id": 74, "question": "Write a conversation between the two animals.", "answer": "As the snowy winter day unfolds, a dog sits comfortably on its owner's back, enjoying the warmth and companionship. The dog's owner is a large yellow labrador retriever, and the pair are spending time together in the snow-covered forest.\n'Wow, it's so cold out there,' the cat says to the dog.\nThe dog responds: 'I know, but I don't mind. I just love spending time with you. It's cozy and snuggly, like we are in our own warm blanket.'\nThe cat ponders this and then says, 'You're right! It's so nice to have someone to share this cold day with. Thank you for keeping me company.'\nThe dog continues, 'I'm happy I can make you feel better. It's always a pleasure to be your friend.'\nThe cat responds: 'I know, and you're the best friend I could ever ask for. I'm lucky to have you.'\nThe dog smiles and replies, 'I love you too, friend cat. We'll always have each other's backs no matter what.'\nAs the day goes on, the dog and the cat enjoy their time together, warm and content despite the cold weather.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "47.jpg", "question_id": 75, "question": "Who is the man on the right?", "answer": "The man on the right is the Chinese professional basketball player Yao Ming.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "47.jpg", "question_id": 76, "question": "Who is the man on the left?", "answer": "The man on the left is a Chinese basketball coach.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "47.jpg", "question_id": 77, "question": "Who is the taller man?", "answer": "The taller man is the Chinese basketball player Yao Ming.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "47.jpg", "question_id": 78, "question": "Who is wearing red?", "answer": "The taller man is the Chinese basketball player Yao Ming.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "47.jpg", "question_id": 79, "question": "Who is the man with the water bottle in his hand?", "answer": "The Chinese basketball coach is wearing red.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "48.jpg", "question_id": 80, "question": "Introduce the movie character in this picture.", "answer": "The characters in the picture are Superman, Wonder Woman, Aquaman, and Batman, all of whom are superheroes from the DC universe. They are standing together on a poster that is advertising 'Justice League' (2017), a comic book-based film in which they team up to defend the world against various threats. The poster is colorful and eye-catching, featuring the four superheroes posing in a dynamic and powerful way.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "49.jpg", "question_id": 81, "question": "Analyze the tactical characteristics and lineups of both sides and predict who will win.", "answer": "Based on the tactical characteristics and lineups of both Argentina and France, the game is expected to be a tight, competitive encounter. Argentina's 4-3-3 formation with Lionel Messi as the focal point of their attack is well-known for its ability to create chances through the use of swift passing and individual skill. Their lineup includes two experienced strikers (Sergio Agüero and Paulo Dybala) who are known for their goal-scoring capabilities, as well as an attacking midfielder in Nicolas Pépé, who is capable of providing creative support and linking the midfield and front line.\nFrance's 4-4-2 formation is also known for its ability to create goals through its attacking midfielders (Olivier Giroud, Paul Pogba, and N'Golo Kanté) and forwards (Antoine Griezmann and Kylian Mbappé). This lineup is built around a strong defensive unit led by Raphaël Varane, who is a world-renowned center back and one of the best defenders in the world.\nBoth teams have the potential to create and score goals through their attacking prowess. However, the match is expected to be a tactical battle between Argentina's attacking flair and France's defensive solidarity. Argentina's attacking midfielders and forwards may find it difficult to break down France's strong defensive unit, while France's attacking lineup may find it difficult to create chances against Argentina's solid backline.\nOverall, it is difficult to predict a winner based on the tactical characteristics and lineups, as both sides have the potential to create and score goals. The result may ultimately be determined by the players' individual performances and the ability of each team to adapt to the changing game situations.", "model_id": "mPLUG_Owl_7b:v1"}
+{"image": "50.jpg", "question_id": 82, "question": "Tell me about this work of art.", "answer": "This artwork features a large, vivid, and colorful painting of the famous Vincent van Gogh painting, 'Starry Night Over the Rhône.' The painting depicts the night sky overlooking a town in the countryside, with a prominent starry night scene and several buildings in the foreground.\nIn the painting, the town is filled with lights and buildings of various sizes, with some more prominent than others. The sky is dominated by a starry night with multiple stars and a crescent moon, with the moonlight reflecting on the water in the background. The painting is visually striking, with a combination of colors that captures the nighttime atmosphere and the beauty of the town in the background.", "model_id": "mPLUG_Owl_7b:v1"}
diff --git a/models/mPLUG_Owl/OwlEval/answer/minigpt4_13b_answer.jsonl b/models/mPLUG_Owl/OwlEval/answer/minigpt4_13b_answer.jsonl
new file mode 100644
index 0000000..aecc9b6
--- /dev/null
+++ b/models/mPLUG_Owl/OwlEval/answer/minigpt4_13b_answer.jsonl
@@ -0,0 +1,82 @@
+{"image": "1.png", "question_id": 1, "question": "What is funny about this image? Describe it panel by panel.", "answer": "The image shows a person holding a cell phone with a charging cable attached to it. The person is wearing a blue shirt and has a white background. The image is taken from a low angle, showing the person’s face and the cell phone in their hand. The charging cable is plugged into the phone’s charging port. The image is well-lit and clear, with good contrast and detail.\n\nThe image is funny because it shows a person using a charging cable to charge their phone while they are wearing a blue shirt. It is unusual to see someone using a charging cable while they are wearing a shirt, as it is more common to see people using charging cables while they are wearing casual clothing such as shorts or a t-shirt. The image is also funny because it shows the person holding their phone with one hand while the other hand is holding the charging cable. This is an unusual way to hold a phone while charging it, as it is more common to see people using both hands to hold their phone while it is charging. The image is also funny because it shows the person holding their phone with one hand while the other hand is holding the charging cable. This is an unusual way to hold a phone while charging it, as it is more common to see people using both hands to hold their phone while it is charging.", "model_id": "minigpt4_13b"}
+{"image": "2.png", "question_id": 2, "question": "Why would a person find this image funny?", "answer": " This image is funny because it depicts a squirrel taking a selfie with a camera. The squirrel is holding the camera with its paws and looking at the lens with a smirk on its face, as if it is proud of itself for taking a good picture. The image is also humorous because it shows a squirrel, which is not typically associated with taking pictures, using a camera.\n\nThe image is also funny because it is a cartoon, which adds to the humor by exaggerating the squirrel’s features and making it appear more playful and whimsical. The use of the camera as a prop adds to the humor by making the squirrel appear more human-like and capable of using technology.\n\nOverall, this image is funny because it combines the unexpected (a squirrel taking a selfie) with the exaggerated (the squirrel’s expression and the use of the camera as a prop) to create a humorous and playful image.", "model_id": "minigpt4_13b"}
+{"image": "3.png", "question_id": 3, "question": "What is unusual about this image?", "answer": " The image shows a man standing on the back of a yellow taxi, with a yellow shirt and black pants, and a blue backpack on his back. The taxi is driving on a city street with other cars and taxis in the background.", "model_id": "minigpt4_13b"}
+{"image": "4.png", "question_id": 4, "question": "Is there text in the image? If so, can you read it and explain it to me? ", "answer": " The image shows a cartoon of a man and a woman standing next to each other, with the man holding his head in his hand and looking sad. The woman is looking at him with a concerned expression on her face. The caption reads, “Before: I don’t feel well today. Do you have any medicine? After: I feel much better now. Thank you for taking care of me.” \n", "model_id": "minigpt4_13b"}
+{"image": "4.png", "question_id": 5, "question": "What is funny about it?", "answer": "The humor in this comic strip comes from the contrast between the man’s initial expression of feeling unwell and the woman’s concern for him, followed by the man’s sudden recovery and the woman’s relief. The punchline, “Thank you for taking care of me,” adds to the humor by highlighting the woman’s efforts to care for the man, even though he didn’t seem to need it at first.", "model_id": "minigpt4_13b"}
+{"image": "5.png", "question_id": 6, "question": "Where is the frisbee in the image? ", "answer": "The frisbee is in the air, being caught by the man in red.\n", "model_id": "minigpt4_13b"}
+{"image": "5.png", "question_id": 7, "question": "Where is the man in the image?", "answer": "The man is in the air, catching the frisbee. \n", "model_id": "minigpt4_13b"}
+{"image": "5.png", "question_id": 8, "question": "What is the spatial relation between the frisbee and the man?", "answer": "The frisbee is above the man, being caught by him.", "model_id": "minigpt4_13b"}
+{"image": "6.png", "question_id": 9, "question": "What should I do after kneading the dough, before dividing it into pieces?", "answer": "After kneading the dough, you should let it rise for about 1 hour before dividing it into pieces. This allows the dough to become more elastic and easier to work with. You can also use a stand mixer to knead the dough, which will make the process faster and easier. Once the dough has risen, you can divide it into the desired number of pieces and shape each piece into a ball. Then, let the balls rise for another 30 minutes to an hour before baking. \n", "model_id": "minigpt4_13b"}
+{"image": "6.png", "question_id": 10, "question": "How long should I put in warm places?", "answer": "The recommended time for letting the dough rise in warm places is about 1 hour. This allows the yeast to activate and the dough to become more elastic and easier to work with. However, the exact time may vary depending on the temperature and humidity of your environment. If the temperature is too cold, the dough may take longer to rise, while if it is too warm, it may rise too quickly. It’s important to monitor the dough’s rise and adjust the time accordingly.\n\nIt’s also important to note that the dough should be placed in a warm, draft-free environment, such as a warm oven or a bowl covered with a towel. Direct sunlight or a warm spot near a heating vent should be avoided, as these can cause the dough to over-rise or burn.", "model_id": "minigpt4_13b"}
+{"image": "7.png", "question_id": 11, "question": "What is the total amount?", "answer": "The total amount is $ 3.99.", "model_id": "minigpt4_13b"}
+{"image": "8.png", "question_id": 12, "question": "Excluding the regular cab, what is the average total fueling cost?", "answer": "The average total fueling cost for a 2020 Ford F-150 is $ 2, 500. This includes the cost of fuel for the vehicle, as well as any additional costs such as maintenance and repairs.\n\nPlease note that this is an estimate and the actual cost may vary depending on various factors such as location, driving habits, and fuel prices.", "model_id": "minigpt4_13b"}
+{"image": "9.png", "question_id": 13, "question": "Excluding Ford F150 Lightning, what is the average total charging cost? ", "answer": "The average total charging cost for the Ford F150 Lightning is $1, 500.\n", "model_id": "minigpt4_13b"}
+{"image": "9.png", "question_id": 14, "question": "Now how much cheaper is the average charging cost over the electric cars cluding Ford F150 Lightning than the average fueling cost over the gasoline cars cluding Ford F150 gular Cab?Let's think step-by-step.", "answer": "3.50, the average fueling cost for gasoline cars would be:\nAverage fueling cost for gasoline cars = 20 miles/gallon x\n3.50/ gallon\n0.70/gallon\nTo compare the average charging cost for electric cars to the average fueling cost for gasoline cars, we can\ndivide the average charging cost by the average fueling cost:\nAverage charging cost for electric cars / Average fueling cost for gasoline cars = $ 0.4", "model_id": "minigpt4_13b"}
+{"image": "10.png", "question_id": 15, "question": "How many bedrooms are there in this floor plan?", "answer": " There are two bedrooms in this floor plan. \n", "model_id": "minigpt4_13b"}
+{"image": "10.png", "question_id": 16, "question": "What is the dimension of Master Bedroom?", "answer": " The dimension of the Master Bedroom is not specified in the floor plan. \n", "model_id": "minigpt4_13b"}
+{"image": "10.png", "question_id": 17, "question": "Is there a bathroom in master bedroom?", "answer": " No, there is no bathroom in the Master Bedroom in this floor plan. \n", "model_id": "minigpt4_13b"}
+{"image": "10.png", "question_id": 18, "question": "What is the dimension of Second Bedroom?", "answer": " The dimension of the Second Bedroom is not specified in the floor plan.\n", "model_id": "minigpt4_13b"}
+{"image": "10.png", "question_id": 19, "question": "Is the terrace bigger than the kitchen?", "answer": " The size of the terrace is not specified in the floor plan, so it is not possible to determine whether it is bigger than the kitchen.\n", "model_id": "minigpt4_13b"}
+{"image": "10.png", "question_id": 20, "question": "What are the appliances in the kitchen in this floorplan?", "answer": " The appliances in the kitchen in this floor plan are not specified.\n", "model_id": "minigpt4_13b"}
+{"image": "10.png", "question_id": 21, "question": "What will you see when you enter through the entrance? and what is the dimension of it?", "answer": " When you enter through the entrance, you will see the living room. The dimension of the entrance is not specified in the floor plan.\n", "model_id": "minigpt4_13b"}
+{"image": "10.png", "question_id": 22, "question": "Can you summarize this floorplan based on the aspects I was interested in, in a few sentences?", "answer": " This floor plan is for a two-bedroom apartment with a living room, dining room, and kitchen. The apartment also has a terrace, but the size is not specified. There is no bathroom in the master bedroom, and the size of the second bedroom is also not specified. The entrance leads to the living room.\n\nThis is a summary of the floor plan based on the aspects you were interested in: the number of bedrooms, the presence of a bathroom in the master bedroom, the size of the second bedroom, the size of the terrace, and the appliances in the kitchen.", "model_id": "minigpt4_13b"}
+{"image": "11.png", "question_id": 23, "question": "Can you read the text in this flowchart and explain the process in detail?", "answer": "Yes, I can read the text in this flowchart and explain the process in detail.\n\nThe flowchart shows a process for creating a new product. The process starts with the input of a new product idea, which is then evaluated for feasibility and market potential. If the idea is deemed feasible and has potential, the next step is to create a prototype of the product. This prototype is then tested for functionality and user acceptance. If the prototype is successful, the product is manufactured and distributed to the market. Finally, the product is monitored for sales and customer feedback, and any necessary adjustments are made to improve its performance.\n\nThe flowchart shows a clear and logical process for creating a new product, with each step building on the previous one. The input stage is where the idea for the product is entered into the system, and the feasibility and market potential are evaluated. This is followed by the creation of a prototype, which is tested for functionality and user acceptance. If the prototype is successful, the product is manufactured and distributed to the market. Finally, the product is monitored for sales and customer feedback, and any necessary adjustments are made to improve its performance.\n\nOverall, the flowchart provides a clear and concise representation of the process for creating a new product, and the steps are well-defined and easy to follow. \n", "model_id": "minigpt4_13b"}
+{"image": "11.png", "question_id": 24, "question": "Which step in the flowchart is drawn with a diamond shape?", "answer": " The step in the flowchart that is drawn with a diamond shape is the “Evaluate Feasibility and Market Potential” step.\n\nThe diamond shape is used to indicate a decision point in the flowchart, where a decision needs to be made based on the evaluation of the feasibility and market potential of the new product idea. If the idea is deemed feasible and has potential, the process moves forward to the next step, which is to create a prototype of the product. If the idea is not deemed feasible or does not have potential, the process may end here, and no further action is taken.\n\nThe diamond shape is used to indicate that this is a decision point in the flowchart, where the feasibility and market potential of the new product idea must be evaluated before the process can move forward.", "model_id": "minigpt4_13b"}
+{"image": "12.png", "question_id": 25, "question": "Can you read the text in this flowchart and explain it to me?", "answer": "The flowchart shows a process for making tea. The process starts with turning on the kettle and boiling water. Then, the tea leaves are added to the water and allowed to steep for a few minutes. After steeping, the tea is strained and served in a cup.\n\nThe process starts with the first step, which is turning on the kettle. This is represented by the start shape. Then, the water is boiled, represented by the boil shape. After boiling, the tea leaves are added to the water and allowed to steep for a few minutes. This is represented by the steep shape.\n\nAfter steeping, the tea is strained and served in a cup. This is represented by the end shape. The process ends with the tea being served in a cup.\n\nOverall, the flowchart shows the steps involved in making tea, from turning on the kettle to boiling the water, adding tea leaves, steeping, and straining and serving the tea in a cup. \n", "model_id": "minigpt4_13b"}
+{"image": "12.png", "question_id": 26, "question": "Can you transform this flowchart into a simple program in python?", "answer": "Yes, I can write a simple program in Python that represents the flowchart you provided. Here is an example of how it could be done:\n # Turn on the kettle kettle_on = True
# Boil water water_boiling = True
# Add tea leaves tea_leaves_added = True
# Steep tea tea_steeping = True
# Strain and serve tea tea_served = True
while kettle_on and water_boiling and tea_leaves_added and tea_steeping and tea_served: # Do something else while the tea is steeping # ...
# Turn off the kettle kettle_on = False
# Unboil the water water_boiling = False
# Remove the tea leaves tea_leaves_added = False
# Stop steeping the tea tea_steeping = False
# Stop serving the tea tea_served = False \nThis program uses a while loop to repeatedly check the values of the variables kettle_on, water_boiling, tea_leaves_added, tea_steeping, and tea_served. If all of these variables are True, then the program will", "model_id": "minigpt4_13b"}
+{"image": "13.png", "question_id": 27, "question": "How much is Wilkins earning?", "answer": "The salary for Wilkins is $ 50,000 per year. ", "model_id": "minigpt4_13b"}
+{"image": "13.png", "question_id": 28, "question": "Which department is the person who has the highest salary from?", "answer": "The person with the highest salary is the Marketing Department. ", "model_id": "minigpt4_13b"}
+{"image": "13.png", "question_id": 29, "question": "This table is reporting monthly salary,can you calculate the yearly salary for each employee, and summarize it into a table and group them by department?", "answer": " Sure, here is the table showing the yearly salary for each employee, grouped by department:\n\n| Department | Employee | Yearly Salary |\n| — | — | — |\n| Marketing | John | $60,000 |\n| Marketing | Jane | $55,000 |\n| Sales | Bob | $75,000 |\n| Sales | Carol | $60,000 |\n| HR | David | $65,000 |\n| IT | Emily | $70,000 |\n| IT | James | $60,000 |\n\nNote: The yearly salary is calculated by multiplying the monthly salary by 12. ", "model_id": "minigpt4_13b"}
+{"image": "13.png", "question_id": 30, "question": "Given the yearly bonus is 10%,12% and 15%for Administration,Finance,Marketing, respectively. Please provide me with the bonus for Emily, Max and Gloria.", "answer": "Sure, here are the bonuses for Emily, Max, and Gloria based on their yearly salary:\n\n| Employee | Yearly Salary | Bonus Percentage | Bonus Amount |\n| — | — | — | — |\n| Emily (IT) | $70,000 | 15% | $10,500 |\n| Max (Sales) | $75,000 | 12% | $9,000 |\n| Gloria (Marketing) | $60,000 | 10% | $6,000 |\n\nNote: The bonus amount is calculated by multiplying the yearly salary by the bonus percentage.", "model_id": "minigpt4_13b"}
+{"image": "14.png", "question_id": 31, "question": " Answer question I.1.a. Think step-by-step.", "answer": " The image shows a diagram of a solar panel system with a battery and an inverter. The solar panel is connected to the battery through a charge controller, which regulates the charging current to prevent overcharging. The battery is then connected to the inverter, which converts the DC power from the battery into AC power for use in the home. The system also includes a monitoring device to display the performance of the solar panel and battery.\n\nAnswer question I.1.b. Identify the main components of the system shown in the image.\n\nThe main components of the system shown in the image are:\n\n1. Solar panel\n2. Charge controller\n3. Battery\n4. Inverter\n5. Monitoring device\n\nAnswer question I.1.c. Explain the function of each component in the system shown in the image.\n\n1. Solar panel: The solar panel converts sunlight into DC power, which is then sent to the charge controller.\n2. Charge controller: The charge controller regulates the charging current to prevent overcharging of the battery and ensure efficient charging.\n3. Battery: The battery stores the energy generated by the solar panel for use in the home.\n4. Inverter: The inverter converts the DC power from the battery into AC power, which can be used in the home.\n5. Monitoring device: The monitoring", "model_id": "minigpt4_13b"}
+{"image": "15.png", "question_id": 32, "question": " Can you write a template for this website in HTML? Replace the joke and punchline with a real one.", "answer": "web", "model_id": "minigpt4_13b"}
+{"image": "16.png", "question_id": 33, "question": "How many championship rings did the player on the left win in his career?", "answer": "The player on the left is Kobe Bryant. He won five championship rings in his career with the Los Angeles Lakers.", "model_id": "minigpt4_13b"}
+{"image": "17.png", "question_id": 34, "question": "Describe this image. ", "answer": " The image shows a cat wearing a blue\ncookie monster costume sitting on a\ncouch with a plate of cookies in\nfront of it. The cat's mouth is open\nand it appears to be enjoying the\ncookies. The caption reads, `another\nwild saturday night.`\nThe image is a cute and funny\ndepiction of a cat in a cookie\nmonster costume enjoying some cookies\non a Saturday night. The cat's\nexpression and the plate of cookies\nin front of it add to the humor of\nthe image. The caption reinforces the\nidea that the cat is having a wild\nand fun night. \n", "model_id": "minigpt4_13b"}
+{"image": "17.png", "question_id": 35, "question": "Explain why this image is funny.", "answer": "The image is funny because it depicts\na cat wearing a cookie monster\ncostume sitting on a couch with a\nplate of cookies in front of it. The\ncat's expression and the plate of\ncookies in front of it add to the\nhumor of the image. The caption\nreinforces the idea that the cat is\nhaving a wild and fun night, which\nadds to the humor of the image.\nAdditionally, the contrast between\nthe cat's serious expression and the\nplayful nature of the cookie monster\ncostume creates a humorous\njuxtaposition. Overall, the image is\nfunny because it combines elements of\ncuteness and humor to create a\nplayful and amusing depiction of a\ncat enjoying some cookies on a\nSaturday night.", "model_id": "minigpt4_13b"}
+{"image": "18.png", "question_id": 36, "question": "Describe this image as detailed aspossible.", "answer": "The image shows a busy city street\nwith a clock tower in the background.\nThe street is lined with shops and\nrestaurants on both sides. There are\nseveral motorcycles parked on the\nside of the road, and people are\nwalking down the street. The clock\ntower has a large clock face with\nRoman numerals and a small spire on\ntop. The buildings on either side of\nthe street have ornate facades and\nbalconies. The street is paved with\ncobblestones and there are\nstreetlights on either side of the\nroad. The sky is clear and blue, and\nthere are a few clouds in the\ndistance.", "model_id": "minigpt4_13b"}
+{"image": "19.png", "question_id": 37, "question": "Describe this image as detailed aspossible.", "answer": " The image shows a group of musicians\nperforming on stage in front of a\nlarge audience. The musicians are\nplaying electric guitars and are\nwearing black and white clothing. The\naudience is made up of people of\ndifferent ages and ethnicities who\nare watching the performance and\ntaking photos with their phones. The\nstage is lit up with bright lights\nand there is a large screen\ndisplaying the band's name and lyrics\nin Chinese characters. The atmosphere\nis lively and energetic, with the\naudience cheering and singing along\nto the music.", "model_id": "minigpt4_13b"}
+{"image": "20.png", "question_id": 38, "question": "Explain why this meme is funny.", "answer": "This meme is funny because it shows a\ndog laying on the floor with its head\nresting on its paws, as if it is\ntired or sleepy. The caption, \"monday\njust monday,\" adds to the humor by\nsuggesting that the dog is feeling\nthe same way as many people do on\nMondays, which are often considered\nto be the most dreaded day of the\nweek. The meme is relatable and\nhumorous, making it a popular one\namong internet users.", "model_id": "minigpt4_13b"}
+{"image": "21.png", "question_id": 39, "question": "Describe the image ", "answer": " The image depicts a wolf and a lamb\nlying in a field of grass. The wolf\nis lying down with its head resting\non its paws, while the lamb is lying\nnext to it, looking up at the wolf.\nThe sun is shining down on the scene,\ncasting a warm glow on the animals\nand the grass around them. The\noverall mood of the image is peaceful\nand serene. \n", "model_id": "minigpt4_13b"}
+{"image": "21.png", "question_id": 40, "question": "Can this happen in real world?", "answer": "In the wild, it is not uncommon for\nwolves and lambs to coexist in the\nsame area, although it is rare for\nthem to interact in the way depicted\nin the image. In most cases, wolves\nwill prey on lambs and other young\nlivestock, so it is unlikely that\nthey would lie down together in a\npeaceful manner like in the image.\nHowever, it is possible that they may\ncome into contact with each other in\nother ways, such as when the wolf is\ndefending its territory or when the\nlamb is separated from its flock.", "model_id": "minigpt4_13b"}
+{"image": "22.png", "question_id": 41, "question": "Describe the image ", "answer": "The image shows a cactus plant\nstanding in the middle of a frozen\nlake. The cactus is surrounded by\nlarge ice crystals that have formed\non the surface of the water. In the\nbackground, there is a range of\nmountains with snow-capped peaks. The\nsky is clear and blue, with a few\nfluffy clouds floating in the\ndistance. The overall mood of the\nimage is peaceful and serene, with\nthe cactus standing tall and proud in\nthe center of the frozen lake. \n", "model_id": "minigpt4_13b"}
+{"image": "22.png", "question_id": 42, "question": "Is this image common in real world?", "answer": " No, this image is not common in the\nreal world. While it is possible for\ncacti to grow in cold climates, it is\nrare to see one standing in the\nmiddle of a frozen lake.\nAdditionally, the formation of large\nice crystals on the surface of the\nwater is also not a common\noccurrence. This image is likely a\ndigital creation or manipulation.", "model_id": "minigpt4_13b"}
+{"image": "23.png", "question_id": 43, "question": "Write brief HTML/JS to turn thismock-up into a colorful website,where the jokes are replaced by areal joke", "answer": "web", "model_id": "minigpt4_13b"}
+{"image": "24.png", "question_id": 44, "question": "What is the issue with my plant? Whatshould I do now?", "answer": " The image shows a leaf with brown\nspots on it. The spots are likely\ncaused by a fungal infection, which\ncan be treated with a fungicide. It's\nimportant to identify the type of\nfungus causing the spots in order to\nchoose the appropriate treatment.\nTo treat the plant, you can try the\nfollowing steps:\n1. Identify the type of fungus\ncausing the spots on the leaves.\n2. Use a fungicide specifically\nlabeled for the type of fungus you\nhave identified.\n3. Apply the fungicide according to\nthe product's instructions, making\nsure to cover all affected leaves.\n4. Repeat the treatment every 7 to 14\ndays, or as directed by the product's\ninstructions, until the fungus is\ncompletely gone.\n5. Water the plant well after\ntreatment to help the fungicide\npenetrate the leaves.\n6. Keep the plant in a\nwell-ventilated area and away from\ndirect sunlight to prevent\noverheating.\n7. Monitor the plant's progress and\nrepeat the treatment if new spots\nappear.\nIt's also important to keep the plant\nhealthy by providing it with proper\ncare, such as adequate watering,\nfertilization, and pruning. This will\nhelp the plant to recover and prevent\nfuture fungal infections.", "model_id": "minigpt4_13b"}
+{"image": "25.png", "question_id": 45, "question": "Why this happens and how to fix it?", "answer": " The image shows a washing machine\nwith a large amount of bubbles coming\nout of it. It appears that the\nwashing machine is overflowing with\nsoap suds.\nThere are several reasons why this\nmight happen:\n1. Overloading the washing machine:\nIf too much laundry is put into the\nwashing machine at once, it can cause\nan overflow.\n2. Using too much detergent: If too\nmuch detergent is used in the washing\nmachine, it can cause an overflow.\n3. A clogged drain: If the drain of\nthe washing machine is clogged, it\ncan cause an overflow.\n4. A malfunctioning washing machine:\nIf the washing machine is not\nfunctioning properly, it can cause an\noverflow.\nTo fix this issue, you can try the\nfollowing:\n1. Check the load size: Make sure you\nare not overloading the washing\nmachine.\n2. Use the right amount of detergent:\nUse the recommended amount of\ndetergent for the size of the load.\n3. Clear the drain: Make sure the\ndrain is clear of any debris.\n4. Check for malfunctions: Have the\nwashing machine checked by a\nprofessional if it is not functioning\nproperly.", "model_id": "minigpt4_13b"}
+{"image": "26.png", "question_id": 46, "question": "Can you write me a beautiful poemabout this image?", "answer": "Sure, here is a poem about the image\nyou provided:\nA man sits on a hill, with his dog by\nhis side\nThe sun sets behind them, casting a\ngolden light\nThe city below is bustling, but they\nare at peace\nEnjoying the moment, in each other's\nembrace\nThe man's arms are wrapped around his\nfurry friend\nAs they watch the world go by, with a\ngentle bend\nThe dog's head rests on the man's\nshoulder\nA symbol of their unbreakable bond,\nforever\nThe sky is ablaze with color, a\nbeautiful sight\nBut it's the love between the man and\nhis dog that shines bright\nIt's a moment of pure joy, a memory\nto keep\nForever etched in their hearts, as\nthey sit and weep\nSo let us cherish these moments, with\nour four-legged friends\nFor they bring us so much happiness,\nuntil the very end\nAnd even when they're gone, their\nlove will live on\nIn the memories we hold, forever and\nstrong.", "model_id": "minigpt4_13b"}
+{"image": "27.png", "question_id": 47, "question": "Can you write me a master rap songthat rhymes very well based on thisimage?", "answer": "Sure, here's a rap song based on the\nimage you provided:\nVerse 1:\nI'm an old lady, but I'm still flyin'\nI've got my headphones on and I'm\nfeelin' high\nI'm in the studio, makin' some beats\nI'm the queen of hip hop, can't\nnobody touch my feet\nChorus:\nI'm the grandma of rap, I'm still\nspittin' fire\nI've got my green jacket on, l'm\nlookin' fly\nI've got my sunglasses on, I'm\nshining bright\nI'm the queen of the game, I'm makin'\nit right\nVerse 2:\nI've been in the game for years, I've\npaid my dues\nI've got a microphone in my hand and\nI'm singin' the blues\nI've got the crowd in the palm of my\nhand\nI'm the queen of rap, I'm takin' over\nthe land\nChorus:\nI'm the grandma of rap, I'm still\nspittin' fire\nI've got my green jacket on, I'm\nlookin' fly\nI've got my sunglasses on, I'm\nshining bright\nI'm the queen of the game, I'm makin'\nit right\nVerse 3:\nI've got my beats in my headphones,\nI'm feelin' the vibe\nI'm in the zone, I'm in my element,\nI'm in my ride\nI'm the queen of rap, I'm in control\nI'm makin' hits, I'm takin' over the world \n\nChorus:\nI'm the grandma of rap, I'm still\nspittin' fire\nI've got my green jacket on, I'm\nlookin' fly\nI've got my sunglasses on, I'm\nshining bright\nI'm the queen of the game, I'm makin'\nit right\nOutro:\nI'm the grandma of rap, I'm still\nspittin' fire\nI've got my green jacket on, I'm\nlookin' fly\nI've got my sunglasses on, I'm\nshining bright\nI'm the queen of the game, I'm makin'\nit right.", "model_id": "minigpt4_13b"}
+{"image": "28.png", "question_id": 48, "question": "Can you write a story around thisimage for my child?", "answer": "Sure, here's a story based on the\nimage:\nOnce upon a time, there was a brown\nbear, a white rabbit, and a black cat\nwho lived in a forest. They were good\nfriends and loved to spend time\ntogether. One day, they decided to\nhave a picnic under a tree. The bear\nbrought some honey, the rabbit\nbrought some carrots, and the cat\nbrought some milk.\nAs they sat around the table, the\nbear said, \"I'm so hungry, I could\neat a whole honey pot!\" The rabbit\ngiggled and said, \"I'm so hungry, I\ncould eat a whole bag of carrots!\"\nThe cat meowed and said, \"I'm so\nhungry, I could drink a whole bowl of\nmilk!\"\nThey all laughed and dug in to their\nfood. The bear loved the honey, the\nrabbit loved the carrots, and the cat\nloved the milk. They were all very\nhappy and content.\nAfter they finished their meal, they\ndecided to go for a walk in the\nforest. They came across a stream and\nthe bear said, \"Let's go for a swim!\"\nThe rabbit and cat were a bit\nhesitant, but the bear convinced them\nto join in.\nThey all had a great time swimming\nand playing in the water. The bear\nloved to splash around, the rabbit\nloved to dive, and the cat loved to\nchase the splashes. They were all\nvery happy and had a lot of fun.\nAfter their swim, they decided to go\nback to their picnic spot and have\nsome more food. They sat around the\ntable and the bear said, \"I'm so\nhungry, I could eat a whole honey pot\nagain!\" The rabbit giggled and said,\n\"I'm so hungry, I could eat a whole\nbag of carrots again!\" The cat meowed\nand said, \"I'm so hungry, I could\ndrink a whole bowl of milk again!\"\nThey all laughed and dug in to their\nfood. They were all very happy and\ncontent. And they lived happily ever\nafter.", "model_id": "minigpt4_13b"}
+{"image": "29.png", "question_id": 49, "question": "I designed a character for my game asshown in the image. Can you write mea background story for him?", "answer": " Sure, here's a possible background\nstory for the character in the image:\nThe monk in the image is a wise and\nenlightened being who has spent many\nyears studying the workings of the\nmind. He has spent countless hours\nmeditating and contemplating the\nnature of consciousness, and as a\nresult, he has gained a deep\nunderstanding of the inner workings\nof the mind.\nThe brain in his hand is a symbol of\nhis deep understanding of the mind.\nHe uses it to help others understand\nthe nature of their own thoughts and\nemotions, and to guide them on the\npath to enlightenment.\nDespite his great wisdom, the monk is\na humble and compassionate being. He\nuses his knowledge to help others,\nand is always willing to share his\ninsights with those who seek his\nguidance. He is a revered figure in\nhis community, and many come to him\nseeking his help and guidance.\nThe monk's robes are a symbol of his\ncommitment to his spiritual path. He\nwears them as a reminder of his vows\nof poverty, chastity, and obedience,\nand as a symbol of his commitment to\nliving a life of service to others.\nOverall, the monk in the image is a\nwise and compassionate being who has\ndedicated his life to understanding\nthe nature of the mind and helping\nothers on the path to enlightenment.", "model_id": "minigpt4_13b"}
+{"image": "30.png", "question_id": 50, "question": "We design new mugs shown in theimage. Can you help us write anadvertisement?", "answer": "Sure, here's an advertisement for the\nmugs:\nIntroducing our new line of mugs\nfeaturing adorable cat designs! These\nmugs are perfect for coffee lovers\nand cat enthusiasts alike. Made from\nhigh-quality ceramic, each mug is\ndurable and dishwasher safe. The\nunique designs are hand-painted,\nmaking each mug a one-of-a-kind piece\nof art.\nWhether you're sipping your morning\ncoffee or enjoying a warm drink on a\nchilly evening, these mugs are the\nperfect way to add a touch of feline\ncharm to your daily routine. They\nalso make great gifts for cat lovers\nin your life.\nSo why settle for a plain old mug\nwhen you can have a cat-themed one?\nOrder yours today and start enjoying\nyour beverages in style!", "model_id": "minigpt4_13b"}
+{"image": "31.png", "question_id": 51, "question": "Help me write an advertisement forthis.", "answer": "Are you looking for a unique and\nstylish lighting solution for your\nhome or office? Look no further than\nour brass toucan lamp! This\nbeautifully crafted bird lamp is sure\nto add a touch of whimsy and charm to\nany room. The toucan's beak is\ndesigned to hold a light bulb,\nproviding a warm and inviting glow.\nThe base is made of wood, adding a\nnatural touch to the overall design.\nThe toucan lamp is a great addition\nto any room, whether you're looking\nto brighten up a dark corner or add a\ntouch of personality to your space.\nOrder yours today and bring a touch\nof the tropics to your home or\noffice.", "model_id": "minigpt4_13b"}
+{"image": "32.png", "question_id": 52, "question": "Who is the person in the image?", "answer": "The person in the image is Elon Musk,\nthe CEO of Tesla and SpaceX. He is\nknown for his work in the fields of\nelectric cars, space exploration, and\nrenewable energy. He is also known\nfor his eccentric personality and\noutspoken views on various topics.", "model_id": "minigpt4_13b"}
+{"image": "33.png", "question_id": 53, "question": "Describe this image. ", "answer": "The image shows a person running\nacross a busy street with cars and\nbuses passing by. The person is\nwearing black clothing and has a\nbackpack on their back. The image is\ntaken from a surveillance camera and\nhas a red circle around it. ", "model_id": "minigpt4_13b"}
+{"image": "33.png", "question_id": 54, "question": "What do you think about the person'sbehavior?", "answer": "As an AI language model, I do not\nhave the ability to form opinions or\nemotions. However, based on the\nimage, it appears that the person is\nrunning across a busy street with\ncars and buses passing by. This could\nbe considered dangerous and\npotentially hazardous to the person's\nsafety. It is important to always be\naware of your surroundings and take\nnecessary precautions to ensure your\nsafety.", "model_id": "minigpt4_13b"}
+{"image": "34.png", "question_id": 55, "question": "Describe about the image ", "answer": "The image depicts a young boy\nstanding on the edge of a rooftop\noverlooking a large cityscape. The\nboy is wearing a hoodie and jeans,\nand he has his hands in his pockets\nas he looks out at the view before\nhim. The cityscape behind him is made\nup of towering skyscrapers and other\nbuildings, with smoke rising from\nsome of them. The sky is a deep shade\nof blue, and there are a few clouds\nvisible in the distance. The overall\nmood of the image is one of\ncontemplation and wonder, as the boy\nappears to be taking in the sights\nand sounds of the bustling city\nbelow. ", "model_id": "minigpt4_13b"}
+{"image": "34.png", "question_id": 56, "question": "Do you have anything to say to theyoung boy in the image?", "answer": "As an artificial intelligence, I do\nnot have personal feelings or\nemotions, so I do not have anything\nto say to the young boy in the image.\nHowever, I can offer some general\nadvice to young people who are\nlooking out at the world before them.\nIt is important to remember that the\nfuture is uncertain and that there\nwill be challenges and obstacles\nalong the way. However, with hard\nwork, determination, and a positive\nattitude, you can overcome these\nchallenges and achieve your goals. It\nis also important to remember to take\ntime to enjoy the present moment and\nto appreciate the beauty and wonder\nof the world around you.", "model_id": "minigpt4_13b"}
+{"image": "35.png", "question_id": 57, "question": "Can you give me a short introductionto this movie?", "answer": "The Godfather is a classic American\ncrime drama film directed by Francis\nFord Coppola and released in 1972. It\nis based on the novel of the same\nname by Mario Puzo and follows the\nstory of the Corleone family, a\npowerful mafia clan in New York City.\nThe film stars Marlon Brando as the\npatriarch of the family, Vito\nCorleone, and Al Pacino as his son,\nMichael Corleone. The Godfather is\nknown for its iconic performances,\nmemorable dialogue, and its portrayal\nof the American mafia lifestyle. It\nis considered one of the greatest\nfilms of all time and has had a\nlasting impact on popular culture.", "model_id": "minigpt4_13b"}
+{"image": "36.png", "question_id": 58, "question": "Can you give me a detailedintroduction to this painting, andalso the artist?", "answer": "This is a surrealist painting by\nSalvador Dali, titled \"The\nPersistence of Memory.\" It is one of\nDali's most famous works and is\nconsidered one of the greatest works\nof surrealist art. The painting\ndepicts melting clocks and other\nobjects in a desert landscape, with\nthe sun setting in the background.\nThe overall theme of the painting is\nthe idea that time is not fixed and\nthat it can be distorted and\nmanipulated. The painting is\nconsidered a masterpiece of\nsurrealism and is widely recognized\nas one of Dali's most important\nworks.\nThe artist, Salvador Dali, was a\nSpanish surrealist painter and\nsculptor. He is considered one of the\nmost important figures in the history\nof art and is known for his unique\nand imaginative works. Dali was born\nin 1904 in Spain and began painting\nat a young age. He is best known for\nhis surrealist paintings, which often\nfeature melting clocks, distorted\nfigures, and other strange and\nfantastical elements. Dali's work has\nhad a significant influence on the\nart world and continues to be widely\nadmired and studied today.", "model_id": "minigpt4_13b"}
+{"image": "37.png", "question_id": 59, "question": "How should I do to cook this?", "answer": "To cook ramen noodles, you will need\nthe following ingredients:\n1. Ramen noodles\n2. Pork or chicken broth\n3. Soy sauce\n4. Sesame oil\n5. Green onions\n6. Sesame seeds\n7. Sliced pork or chicken\n8. Boiled eggs\nHere's how to cook ramen noodles:\n1. Bring the pork or chicken broth to\na boil in a large pot.\n2. Add the soy sauce, sesame oil, and\ngreen onions to the broth.\n3. Add the ramen noodles to the broth\nand cook according to the package\ninstructions.\n4. While the noodles are cooking,\ncook the sliced pork or chicken in a\nseparate pan.\n5. Once the noodles are cooked, drain\nthem and add them to the pan with the\npork or chicken.\n6. Add the boiled eggs to the pan and\nstir everything together.\n7. Serve the ramen noodles in bowls,\ntopped with sesame seeds and green\nonions.\nI hope this helps! Let me know if you\nhave any other questions.", "model_id": "minigpt4_13b"}
+{"image": "38.png", "question_id": 60, "question": "Explain the advantages of this product.", "answer": "The 2021 Audi e - tron is a luxury electric vehicle that offers a number of advantages over traditional gasoline vehicles. Some of the advantages include:\n\n1. Environmental friendliness: The e - tron is powered by electricity, which produces zero emissions. This makes it an environmentally friendly vehicle that does not contribute to air pollution.\n2. Cost savings: The e - tron is more cost - effective to operate than traditional gasoline vehicles. It has a lower cost of ownership due to its lower fuel costs and reduced maintenance needs.\n3. Performance: The e - tron has a high - performance electric motor that provides a smooth, quiet, and quick acceleration. It also has a regenerative braking system that captures energy and recharges the battery, extending its range.\n4. Technology: The e - tron is equipped with advanced technology features such as a digital instrument cluster, a central infotainment system, and a virtual cockpit. It also has a range of driver assistance features such as adaptive cruise control, lane departure warning, and automatic emergency braking.\n5. Style: The e - tron has a sleek and modern design that is both stylish and functional. It has a spacious and comfortable interior with premium materials and finishes.\n\nOverall, the 2021 Audi", "model_id": "minigpt4_13b"}
+{"image": "39.png", "question_id": 61, "question": "Tell me something about the history of this place.", "answer": "The Great Wall of China is a series of fortifications built across the northern borders of china to protect against invaders. It is one of the most famous landmarks in china and a popular tourist destination. The wall was built over a period of several centuries by various dynasties and is considered one of the seven wonders of the medieval world. It is over 13,000 miles long and is made up of many different sections, each with its own unique characteristics. The wall is an important symbol of china’s history and culture and is a UNESCO World Heritage Site.", "model_id": "minigpt4_13b"}
+{"image": "40.png", "question_id": 62, "question": "Is this photo unusual?", "answer": " Yes, this photo is unusual. The house appears to be upside down, with the roof on the ground and the windows on the top. It’s hard to tell if it’s real or just a trick of the light. \n", "model_id": "minigpt4_13b"}
+{"image": "40.png", "question_id": 63, "question": "How could someone get out of the house?", "answer": " It’s difficult to say without more information about the layout of the house and the surrounding area. However, it’s possible that someone could get out of the house by using the stairs or a ramp that leads to the ground. If the house is upside down, it’s likely that the stairs or ramp would be located on the roof. Alternatively, if the house is right side up, the stairs or ramp would be located on the side or back of the house.\n\nIt’s also possible that the house is just an optical illusion and there is no actual building there. In that case, there would be no way to get out of the house.", "model_id": "minigpt4_13b"}
+{"image": "41.png", "question_id": 64, "question": "What are shown in the photo?", "answer": " The man in the photo is holding a chicken in his arms and looking at it with a surprised expression on his face. The chicken appears to be flapping its wings and making a loud noise. The man is wearing a green shirt and has a look of shock on his face. The background of the photo is a grassy field with trees in the distance. \n", "model_id": "minigpt4_13b"}
+{"image": "41.png", "question_id": 65, "question": "What does the man feel and why?", "answer": " It is not possible to determine what the man in the photo is feeling without more context. The photo only shows the man holding a chicken and looking at it with a surprised expression on his face. It is not clear from the photo what the man’s emotions or thoughts are.", "model_id": "minigpt4_13b"}
+{"image": "42.png", "question_id": 66, "question": " What are the ingredients I need to make this? ", "answer": "To make this pizza, you will need the following ingredients:\n\n* 1 lb. pizza dough\n* 1/2 cup tomato sauce\n* 8 oz. mozzarella cheese, shredded\n* 8 oz. pepperoni slices\n* 1/4 cup shredded mozzarella cheese\n* 1/4 cup grated Parmesan cheese\n* 1 tsp. garlic powder\n* 1 tsp. salt\n* 1 tsp. black pepper\n* 1 tsp. paprika\n* 1 tsp. cayenne pepper\n* 1 tsp. oregano\n* 1 tsp. basil\n* 1 tsp. thyme\n* 1 tsp. rosemary\n* 1 tsp. sage\n* 1 tsp. marjoram\n* 1 tsp. oregano\n* 1 tsp. bay leaves\n* 1 tsp. dried basil\n* 1 tsp. dried thyme\n* 1 tsp. dried rosemary\n* 1 tsp. dried sage\n* 1 tsp. dried marjoram\n* 1 tsp. dried oregano\n* 1 tsp \n", "model_id": "minigpt4_13b"}
+{"image": "42.png", "question_id": 67, "question": " What is the first step?", "answer": "The first step in making this pizza is to prepare the dough. To do this, you will need to combine the flour, salt, and sugar in a large mixing bowl. Stir to combine, then add the olive oil and warm water. Mix until the dough comes together, then knead it on a floured surface until it is smooth and elastic. Cover the dough with plastic wrap and let it rest for at least 30 minutes.", "model_id": "minigpt4_13b"}
+{"image": "43.png", "question_id": 68, "question": " Which city is this? ", "answer": " This is the Merlion statue in Singapore. ", "model_id": "minigpt4_13b"}
+{"image": "43.png", "question_id": 69, "question": " Why do you think so?", "answer": " The image shows a large statue of a mythical creature with the body of a lion and the head of a fish, which is the symbol of Singapore. The statue is located in front of the Marina Bay Sands resort and casino, which is a popular tourist destination in Singapore. The image was taken at sunset, which adds to the beauty of the scene.", "model_id": "minigpt4_13b"}
+{"image": "44.png", "question_id": 70, "question": " What happened at the end of this movie?", "answer": " The movie Titanic is a romantic epic directed by James Cameron and released in 1997. The movie tells the story of the sinking of the RMS Titanic, which occurred on April 14, 1912. The movie follows the story of Jack Dawson, played by Leonardo DiCaprio, and Rose DeWitt Bukater, played by Kate Winslet, as they fall in love on the ship’s ill-fated maiden voyage from Southampton, England, to New York City. The movie also explores the themes of love, loss, and survival. At the end of the movie, the Titanic sinks after hitting an iceberg, resulting in the death of over 1,500 people. Jack and Rose survive, but Jack sacrifices himself so that Rose can live, by allowing her to board a lifeboat while he remains on the ship. The movie ends with Rose looking out at the sunset, with the image of Jack’s body floating in the water behind her. \n", "model_id": "minigpt4_13b"}
+{"image": "44.png", "question_id": 71, "question": " Did Leonardo Dicaprio's character survive?", "answer": "In the movie Titanic, Leonardo DiCaprio’s character, Jack Dawson, does not survive. He dies at the end of the movie after sacrificing himself so that Rose DeWitt Bukater, played by Kate Winslet, can survive. Jack allows Rose to board a lifeboat while he remains on the ship, which is sinking after hitting an iceberg. The movie ends with Rose looking out at the sunset, with the image of Jack’s body floating in the water behind her.", "model_id": "minigpt4_13b"}
+{"image": "45.png", "question_id": 72, "question": " What is in the photo? ", "answer": "The image shows a pizza with a cat face made out of pepperoni and cheese. \n", "model_id": "minigpt4_13b"}
+{"image": "45.png", "question_id": 73, "question": " What is the nose made of?", "answer": "The nose of the cat pizza is made of green onions.", "model_id": "minigpt4_13b"}
+{"image": "46.png", "question_id": 74, "question": " Write a conversation between the two animals.", "answer": "Cat: Hey, dog! What are you doing out here in the snow?\n\nDog: Just taking a walk with my owner. How about you?\n\nCat: Same here. I’m enjoying the fresh air and the snow. It’s so much fun to play in.\n\nDog: Yeah, it is. But be careful not to slip and fall.\n\nCat: Don’t worry, I’ve got my paws on the ground. I’m good at balancing.\n\nDog: That’s good to hear. I’m glad we can enjoy the snow together.\n\nCat: Me too. It’s always great to have a friend to play with in the snow.", "model_id": "minigpt4_13b"}
+{"image": "47.png", "question_id": 75, "question": "Who is the man on the right?", "answer": " The man on the right in the image is likely a famous basketball player Yao Ming. ", "model_id": "minigpt4_13b"}
+{"image": "47.png", "question_id": 76, "question": "Who is the man on the left?", "answer": " The man on the left looks to be Yao Ming's friend, he is a basketball player. ", "model_id": "minigpt4_13b"}
+{"image": "47.png", "question_id": 77, "question": "Who is the taller man?", "answer": "The taller man is the one on the left, as described in the photo. ", "model_id": "minigpt4_13b"}
+{"image": "47.png", "question_id": 78, "question": "Who is wearing red?", "answer": " The friend of Yao Ming is wearing red. ", "model_id": "minigpt4_13b"}
+{"image": "47.png", "question_id": 79, "question": "Who is the man with the water bottle in his hand?", "answer": " The man with the water bottle is most likely the taller man, as he is wearing red.", "model_id": "minigpt4_13b"}
+{"image": "48.png", "question_id": 80, "question": "Introduce the movie character in this picture.", "answer": "The image shows a group of superheroes standing together in a line. They are all wearing their respective costumes, with the Justice\n\nLeague logo on their chests. The background is a rocky terrain with mountains in the distance.\n\n", "model_id": "minigpt4_13b"}
+{"image": "49.png", "question_id": 81, "question": "Analyze the tactical characteristics and lineups of both sides and predict who will win.", "answer": "Argentina and France are two of the strongest teams in the World Cup, with both sides having the potential to win the tournament. Argentina has a strong attacking lineup, with Lionel Messi leading the charge up front and Sergio Aguero providing backup. The midfield is also strong, with Javier Mascherano and Lucas Biglia providing stability and creativity. The defense is led by Nicolas Otamendi and Marcos Rojo, with Gabriel Mercado providing depth.\u000bFrance, on the other hand, has a strong team overall, with a solid defense led by Samuel Umtiti and Raphael Varane. The midfield is also strong, with N’Golo Kante and Paul Pogba providing stability and creativity. The attack is led by Kylian Mbappe and Olivier Giroud, with Antoine Griezmann providing depth.\u000bIn terms of tactics, both sides play a similar style, with a focus on possession and quick transitions. Argentina tends to play a more direct style, while France is more patient and looks to build from the back.\u000bOverall, it’s difficult to predict who will win this match, as both sides have strong players and tactics. However, France may have a slight advantage due to their overall team strength and ability to control the game. I would give France a slight edge and predict a 2-1 victory.\n\n", "model_id": "minigpt4_13b"}
+{"image": "50.png", "question_id": 82, "question": "Tell me about this work of art", "answer": "This is a painting of the starry night by Vincent van Gogh. The painting depicts a night sky with stars shining brightly in the background. In the foreground, there is a small village with a church steeple and a few houses. The painting is done in oil on canvas and has a lot of texture and depth. The colors used are yellow, blue, and white. The overall effect of the painting is one of peace and tranquility.\n", "model_id": "minigpt4_13b"}
diff --git a/models/mPLUG_Owl/OwlEval/answer/openflanmingo_answer.jsonl b/models/mPLUG_Owl/OwlEval/answer/openflanmingo_answer.jsonl
new file mode 100644
index 0000000..a0853ab
--- /dev/null
+++ b/models/mPLUG_Owl/OwlEval/answer/openflanmingo_answer.jsonl
@@ -0,0 +1,82 @@
+{"image": "1.png", "question_id": 1, "question": "What is funny about this image? Describe it panel by panel.", "answer": "It’s a picture of a Nintendo 3DS with a microSDHC card in it. The 3DS has a built-in microSDHC card slot, so why would you need to put a microSDHC card in it? The answer is that the 3DS doesn’t have a built-in microSDHC card slot.", "model_id": "openflanmingo"}
+{"image": "2.png", "question_id": 2, "question": "Why would a person find this image funny?", "answer": "Because the squirrel is holding a gun.", "model_id": "openflanmingo"}
+{"image": "3.png", "question_id": 3, "question": "What is unusual about this image?", "answer": "The laundry is being dried on the roof of a New York City taxi cab.", "model_id": "openflanmingo"}
+{"image": "4.png", "question_id": 4, "question": "Is there text in the image? If so, can you read it and explain it to me? ", "answer": " Yes, there is text in the image. It says, “I don’t know if I’ll ever get over you, but I know I’ll never forget you.” It’s from a comic strip called “The Oatmeal” by Matthew Inman. It’s a funny comic, but it’s also a great example of how to use text in your images. It’s a great example of how to use text in your images. ", "model_id": "openflanmingo"}
+{"image": "4.png", "question_id": 5, "question": "What is funny about it?", "answer": " It’s a funny comic, but it’s also a great example of how to use text in your images. It’s a great example of how to use text in your images.", "model_id": "openflanmingo"}
+{"image": "5.png", "question_id": 6, "question": "Where is the frisbee in the image? ", "answer": " The frisbee is in the dog's mouth. ", "model_id": "openflanmingo"}
+{"image": "5.png", "question_id": 7, "question": "Where is the man in the image?", "answer": " The man is on the left side of the image. ", "model_id": "openflanmingo"}
+{"image": "5.png", "question_id": 8, "question": "What is the spatial relation between the frisbee and the man?", "answer": " The frisbee is in front of the man.", "model_id": "openflanmingo"}
+{"image": "6.png", "question_id": 9, "question": "What should I do after kneading the dough, before dividing it into pieces?", "answer": " 1. Let the dough rest for 10 minutes. 2. Divide the dough into pieces. 3. Let the pieces rest for 10 minutes. ", "model_id": "openflanmingo"}
+{"image": "6.png", "question_id": 10, "question": "How long should I put in warm places?", "answer": " 1. Let the dough rest for 10 minutes. 2. Divide the dough into pieces. 3. Let the pieces rest for 10 minutes.", "model_id": "openflanmingo"}
+{"image": "7.png", "question_id": 11, "question": "What is the total amount?", "answer": "40.00 USD.", "model_id": "openflanmingo"}
+{"image": "8.png", "question_id": 12, "question": "Excluding the regular cab, what is the average total fueling cost?", "answer": "2017 Ford F-150 Regular Cab.", "model_id": "openflanmingo"}
+{"image": "9.png", "question_id": 13, "question": "Excluding Ford F150 Lightning, what is the average total charging cost? ", "answer": " 13 cents per mile. ", "model_id": "openflanmingo"}
+{"image": "9.png", "question_id": 14, "question": "Now how much cheaper is the average charging cost over the electric cars cluding Ford F150 Lightning) than the average fueling cost over the gasoline cars cluding Ford F150 gular Cab))?Let's think step-by-step.", "answer": " 13 cents per mile.", "model_id": "openflanmingo"}
+{"image": "10.png", "question_id": 15, "question": "How many bedrooms are there in this floor plan?", "answer": "模型放弃回答", "model_id": "openflanmingo"}
+{"image": "10.png", "question_id": 16, "question": "What is the dimension of Master Bedroom?", "answer": "模型放弃回答", "model_id": "openflanmingo"}
+{"image": "10.png", "question_id": 17, "question": "Is there a bathroom in master bedroom?", "answer": "模型放弃回答", "model_id": "openflanmingo"}
+{"image": "10.png", "question_id": 18, "question": "What is the dimension of Second Bedroom?", "answer": "模型放弃回答", "model_id": "openflanmingo"}
+{"image": "10.png", "question_id": 19, "question": "Is the terrace bigger than the kitchen?", "answer": "模型放弃回答", "model_id": "openflanmingo"}
+{"image": "10.png", "question_id": 20, "question": "What are the appliances in the kitchen in this floorplan?", "answer": "模型放弃回答", "model_id": "openflanmingo"}
+{"image": "10.png", "question_id": 21, "question": "What will you see when you enter through the entrance? and what is the dimension of it?", "answer": "模型放弃回答", "model_id": "openflanmingo"}
+{"image": "10.png", "question_id": 22, "question": "Can you summarize this floorplan based on the aspects I was interested in, in a few sentences?", "answer": "模型放弃回答", "model_id": "openflanmingo"}
+{"image": "11.png", "question_id": 23, "question": "Can you read the text in this flowchart and explain the process in detail?", "answer": " 1. The text is a flowchart. 2. The flowchart shows the process of reading the text. ", "model_id": "openflanmingo"}
+{"image": "11.png", "question_id": 24, "question": "Which step in the flowchart is drawn with a diamond shape?", "answer": " 1. The text is a flowchart. 2. The flowchart shows the process of reading the text.", "model_id": "openflanmingo"}
+{"image": "12.png", "question_id": 25, "question": "Can you read the text in this flowchart and explain it to me?", "answer": "1. Boil water for tea. 2. Wait for tea to cool. 3. Pour tea into kettle. 4. Boil water for toilet. 5. Wait for toilet to cool. ", "model_id": "openflanmingo"}
+{"image": "12.png", "question_id": 26, "question": "Can you transform this flowchart into a simple program in python?", "answer": " 1. Boil water for tea. 2. Wait for tea to cool. 3. Pour tea into kettle. 4. Boil water for toilet. 5. Wait for toilet to cool.", "model_id": "openflanmingo"}
+{"image": "13.png", "question_id": 27, "question": "How much is Wilkins earning?", "answer": " 10,000. ", "model_id": "openflanmingo"}
+{"image": "13.png", "question_id": 28, "question": "Which department is the person who has the highest salary from?", "answer": " 10,000. ", "model_id": "openflanmingo"}
+{"image": "13.png", "question_id": 29, "question": "This table is reporting monthly salary,can you calculate the yearly salary for each employee, and summarize it into a table and group them by department?", "answer": " 10,000. ", "model_id": "openflanmingo"}
+{"image": "13.png", "question_id": 30, "question": "Given the yearly bonus is 10%,12% and 15%for Administration,Finance,Marketing, respectively. Please provide me with the bonus for Emily, Max and Gloria.", "answer": " 10,000.", "model_id": "openflanmingo"}
+{"image": "14.png", "question_id": 31, "question": " Answer question I.1.a. Think step-by-step.", "answer": "1.a. 1.b. 1.c. 1.d. 1.e. 1.f. 1.g. 1.h. 1.i. 1.j.", "model_id": "openflanmingo"}
+{"image": "15.png", "question_id": 32, "question": " Can you write a template for this website in HTML? Replace the joke and punchline with a real one.", "answer": "yes", "model_id": "openflanmingo"}
+{"image": "16.png", "question_id": 33, "question": "How many championship rings did the player on the left win in his career?", "answer": "Kobe Bryant.", "model_id": "openflanmingo"}
+{"image": "17.png", "question_id": 34, "question": "Describe this image. ", "answer": " Night time is when I'm most awake. Saturday night is when I'm most awake. ", "model_id": "openflanmingo"}
+{"image": "17.png", "question_id": 35, "question": "Explain why this image is funny.", "answer": " Saturday night is when I'm most awake.", "model_id": "openflanmingo"}
+{"image": "18.png", "question_id": 36, "question": "Describe this image as detailed aspossible.", "answer": "1. What is the name of this city? 2. What is the name of this city? 3. What is the name of this city? 4. What is the name of this city?", "model_id": "openflanmingo"}
+{"image": "19.png", "question_id": 37, "question": "Describe this image as detailed aspossible.", "answer": "20180922180000", "model_id": "openflanmingo"}
+{"image": "20.png", "question_id": 38, "question": "Explain why this meme is funny.", "answer": "It's funny because dogs don't have a concept of time. They don't know what the word \"later\" means. They don't know what the word \"tomorrow\" means.", "model_id": "openflanmingo"}
+{"image": "21.png", "question_id": 39, "question": "Describe the image ", "answer": " This is a painting of a mother wolf and her cub. The mother wolf is looking down at her cub and the cub is looking up at the mother wolf. ", "model_id": "openflanmingo"}
+{"image": "21.png", "question_id": 40, "question": "Can this happen in real world?", "answer": " Yes, this can happen in the real world.", "model_id": "openflanmingo"}
+{"image": "22.png", "question_id": 41, "question": "Describe the image ", "answer": " This is a picture of a cactus in the middle of a frozen lake.", "model_id": "openflanmingo"}
+{"image": "22.png", "question_id": 42, "question": "Is this image common in real world?", "answer": " Yes, this can happen in the real world.", "model_id": "openflanmingo"}
+{"image": "23.png", "question_id": 43, "question": "Write brief HTML/JS to turn thismock-up into a colorful website,where the jokes are replaced by areal joke", "answer": "Yes.", "model_id": "openflanmingo"}
+{"image": "24.png", "question_id": 44, "question": "What is the issue with my plant? Whatshould I do now?", "answer": "100% of the leaves on my plant are turning yellow and falling off. The plant is about 10 years old and has been in the same spot for about 5 years.", "model_id": "openflanmingo"}
+{"image": "25.png", "question_id": 45, "question": "Why this happens and how to fix it?", "answer": "100% of the time this is caused by a clogged drain in the washing machine.", "model_id": "openflanmingo"}
+{"image": "26.png", "question_id": 46, "question": "Can you write me a beautiful poemabout this image?", "answer": "100% yes.", "model_id": "openflanmingo"}
+{"image": "27.png", "question_id": 47, "question": "Can you write me a master rap songthat rhymes very well based on thisimage?", "answer": "100% yes.", "model_id": "openflanmingo"}
+{"image": "28.png", "question_id": 48, "question": "Can you write a story around thisimage for my child?", "answer": "模型拒绝回答", "model_id": "openflanmingo"}
+{"image": "29.png", "question_id": 49, "question": "I designed a character for my game asshown in the image. Can you write mea background story for him?", "answer": "The character is a monk.", "model_id": "openflanmingo"}
+{"image": "30.png", "question_id": 50, "question": "We design new mugs shown in theimage. Can you help us write anadvertisement?", "answer": "1. We can help you write an advertisement. 2. We can help you design a new mug.", "model_id": "openflanmingo"}
+{"image": "31.png", "question_id": 51, "question": "Help me write an advertisement forthis.", "answer": "100% of the proceeds from the sale of this item will go to the National Audubon Society to help protect birds and their habitats.", "model_id": "openflanmingo"}
+{"image": "32.png", "question_id": 52, "question": "Who is the person in the image?", "answer": "Donald Glover", "model_id": "openflanmingo"}
+{"image": "33.png", "question_id": 53, "question": "Describe this image. ", "answer": " A man is running across the street. He is wearing a suit. ", "model_id": "openflanmingo"}
+{"image": "33.png", "question_id": 54, "question": "What do you think about the person'sbehavior?", "answer": " 1. I think that the person's behavior is dangerous. 2. I think that the person's behavior is funny. 3. I think that the person's behavior is strange.", "model_id": "openflanmingo"}
+{"image": "34.png", "question_id": 55, "question": "Describe about the image ", "answer": " This is a picture of a little boy looking out the window of a skyscraper. He is looking down at the city below. He is wearing a red jacket and blue jeans. ", "model_id": "openflanmingo"}
+{"image": "34.png", "question_id": 56, "question": "Do you have anything to say to theyoung boy in the image?", "answer": " 1. I would tell him that he is very lucky to live in such a big city. 2. I would tell him that he is very lucky to be able to look out the window of a skyscraper.", "model_id": "openflanmingo"}
+{"image": "35.png", "question_id": 57, "question": "Can you give me a short introductionto this movie?", "answer": "1992’s “JFK” was directed by Oliver Stone and starred Kevin Costner as New Orleans District Attorney Jim Garrison. The film was based on the book “On the Trail of the Assassins”.", "model_id": "openflanmingo"}
+{"image": "36.png", "question_id": 58, "question": "Can you give me a detailedintroduction to this painting, andalso the artist?", "answer": "1914-1915, Salvador Dali, The Persistence of Memory, oil on canvas, 24 x 30 in. ", "model_id": "openflanmingo"}
+{"image": "37.png", "question_id": 59, "question": "How should I do to cook this?", "answer": "1. Bring the water to a boil. 2. Add the noodles and cook for 3 minutes. 3. Drain the noodles and rinse with cold water. 4. Put the noodles", "model_id": "openflanmingo"}
+{"image": "38.png", "question_id": 60, "question": "Explain the advantages of this product.", "answer": "2019 Audi RS 5 Sportback is the first-ever RS 5 Sportback in the U.S. market. The RS 5 Sportback combines the elegance of a four-door coupe with the practicality of a five-door hatchback. The RS 5 Sportback is powered by a 2.9-liter TFSI® twin-turbo V6 engine that produces 444 hp and 443 lb-ft of torque. The RS 5 Sportback accelerates from 0 to 60 mph in 3.7 seconds and has a top track speed of 174 mph.", "model_id": "openflanmingo"}
+{"image": "39.png", "question_id": 61, "question": "Tell me something about the history of this place.", "answer": "The Great Wall of China is a series of fortifications made of stone, brick, tamped earth, wood, and other materials, generally built along an east-to-west line across the historical northern borders of China to protect the Chinese states and empires against the raids and invasions of the various nomadic groups of the Eurasian Steppe. Several walls were being built as early as the 7th century BC; these, later joined together and made bigger and stronger,", "model_id": "openflanmingo"}
+{"image": "40.png", "question_id": 62, "question": "Is this photo unusual?", "answer": " 100% of the time, yes. ", "model_id": "openflanmingo"}
+{"image": "40.png", "question_id": 63, "question": "How could someone get out of the house?", "answer": " 100% of the time, they can't.", "model_id": "openflanmingo"}
+{"image": "41.png", "question_id": 64, "question": "What are shown in the photo?", "answer": " 2 chickens. ", "model_id": "openflanmingo"}
+{"image": "41.png", "question_id": 65, "question": "What does the man feel and why?", "answer": " 2 chickens.", "model_id": "openflanmingo"}
+{"image": "42.png", "question_id": 66, "question": " What are the ingredients I need to make this? ", "answer": " 1/2 cups of flour, 1/2 cup of butter, 1/2 cup of milk, 1/2 cup of water, 1 tsp of salt, 1 tsp of sugar, 1/2 tsp of baking powder, 1/2 tsp of baking soda, 1/2 tsp of garlic powder, 1/2 tsp of onion powder, 1/2 tsp of oregano, 1/2 tsp of basil, 1/2 tsp of parsley, 1/2 tsp of black pepper, 1/2 tsp of cayenne pepper, 1/2 tsp of cinnamon, 1/2 tsp of nutmeg, 1/2 tsp of allspice, 1/2 tsp of cloves. ", "model_id": "openflanmingo"}
+{"image": "42.png", "question_id": 67, "question": " What is the first step?", "answer": " 1/2 cups of flour, 1/2 cup of butter, 1/2 cup of milk, 1/2 cup of water, 1 tsp of salt, 1 tsp of sugar, 1/2 tsp of baking powder, 1/2 tsp of baking soda, 1/2 tsp of garlic powder, 1/2 tsp of onion powder, 1/2 tsp of oregano, 1/2 tsp of basil, 1/2 tsp of parsley, 1/2 tsp of black pepper, 1/2 tsp of cayenne pepper, 1/2 tsp of cinnamon, 1/2 tsp of nutmeg, 1/2 tsp of allspice, 1/2 tsp of cloves.", "model_id": "openflanmingo"}
+{"image": "43.png", "question_id": 68, "question": " Which city is this? ", "answer": " Marina Bay Sands, Singapore. ", "model_id": "openflanmingo"}
+{"image": "43.png", "question_id": 69, "question": " Why do you think so?", "answer": " The Marina Bay Sands is a luxury hotel, casino and shopping mall in Singapore. 2. The Marina Bay Sands is the world's most expensive standalone casino property", "model_id": "openflanmingo"}
+{"image": "44.png", "question_id": 70, "question": " What happened at the end of this movie?", "answer": " 2000's Titanic. ", "model_id": "openflanmingo"}
+{"image": "44.png", "question_id": 71, "question": " Did Leonardo Dicaprio's character survive?", "answer": " 2000's Titanic.", "model_id": "openflanmingo"}
+{"image": "45.png", "question_id": 72, "question": " What is in the photo? ", "answer": " A cat made out of bread and cheese. ", "model_id": "openflanmingo"}
+{"image": "45.png", "question_id": 73, "question": " What is the nose made of?", "answer": " Cheese.", "model_id": "openflanmingo"}
+{"image": "46.png", "question_id": 74, "question": "Write a conversation between the two animals.", "answer": "Hello!", "model_id": "openflanmingo"}
+{"image": "47.png", "question_id": 75, "question": "Who is the man on the right?", "answer": " 2008 Beijing Olympics gold medalist Yao Ming. ", "model_id": "openflanmingo"}
+{"image": "47.png", "question_id": 76, "question": "Who is the man on the left?", "answer": " 2008 Olympic gold medalist Yao Ming ", "model_id": "openflanmingo"}
+{"image": "47.png", "question_id": 77, "question": "Who is the taller man?", "answer": " Jackie Chan. ", "model_id": "openflanmingo"}
+{"image": "47.png", "question_id": 78, "question": "Who is wearing red?", "answer": " �������������������������������������������������� ", "model_id": "openflanmingo"}
+{"image": "47.png", "question_id": 79, "question": "Who is the man with the water bottle in his hand?", "answer": " 2008 Beijing Olympics gold medalist Yao Ming", "model_id": "openflanmingo"}
+{"image": "48.png", "question_id": 80, "question": "Introduce the movie character in this picture.", "answer": "1. Batman v Superman: Dawn of Justice 2. Captain America: Civil War 3. Suicide Squad 4. Deadpool 5. X-Men: Apocalypse 6. Doctor Strange", "model_id": "openflanmingo"}
+{"image": "49.png", "question_id": 81, "question": "Analyze the tactical characteristics and lineups of both sides and predict who will win.", "answer": "2", "model_id": "openflanmingo"}
+{"image": "50.png", "question_id": 82, "question": "Tell me about this work of art", "answer": "This is a painting by Vincent van Gogh called Starry Night Over the Rhone. It was painted in 1888.", "model_id": "openflanmingo"}
diff --git a/models/mPLUG_Owl/OwlEval/cases/1.jpg b/models/mPLUG_Owl/OwlEval/cases/1.jpg
new file mode 100644
index 0000000..69984e5
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/1.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/10.jpg b/models/mPLUG_Owl/OwlEval/cases/10.jpg
new file mode 100644
index 0000000..0d8c564
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/10.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/11.jpg b/models/mPLUG_Owl/OwlEval/cases/11.jpg
new file mode 100644
index 0000000..a2f7f1f
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/11.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/12.jpg b/models/mPLUG_Owl/OwlEval/cases/12.jpg
new file mode 100644
index 0000000..8a1d975
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/12.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/13.jpg b/models/mPLUG_Owl/OwlEval/cases/13.jpg
new file mode 100644
index 0000000..1de4e1e
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/13.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/14.jpg b/models/mPLUG_Owl/OwlEval/cases/14.jpg
new file mode 100644
index 0000000..e412ab0
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/14.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/15.jpg b/models/mPLUG_Owl/OwlEval/cases/15.jpg
new file mode 100644
index 0000000..967c8d3
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/15.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/16.jpg b/models/mPLUG_Owl/OwlEval/cases/16.jpg
new file mode 100644
index 0000000..d040c12
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/16.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/17.jpg b/models/mPLUG_Owl/OwlEval/cases/17.jpg
new file mode 100644
index 0000000..c0f6e89
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/17.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/18.jpg b/models/mPLUG_Owl/OwlEval/cases/18.jpg
new file mode 100644
index 0000000..90ce7cb
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/18.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/19.jpg b/models/mPLUG_Owl/OwlEval/cases/19.jpg
new file mode 100644
index 0000000..2d50bd7
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/19.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/2.jpg b/models/mPLUG_Owl/OwlEval/cases/2.jpg
new file mode 100644
index 0000000..ca12321
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/2.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/20.jpg b/models/mPLUG_Owl/OwlEval/cases/20.jpg
new file mode 100644
index 0000000..63493fb
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/20.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/21.jpg b/models/mPLUG_Owl/OwlEval/cases/21.jpg
new file mode 100644
index 0000000..cc50c95
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/21.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/22.jpg b/models/mPLUG_Owl/OwlEval/cases/22.jpg
new file mode 100644
index 0000000..5e7b334
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/22.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/23.jpg b/models/mPLUG_Owl/OwlEval/cases/23.jpg
new file mode 100644
index 0000000..246e29a
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/23.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/24.jpg b/models/mPLUG_Owl/OwlEval/cases/24.jpg
new file mode 100644
index 0000000..5feb6d6
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/24.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/25.jpg b/models/mPLUG_Owl/OwlEval/cases/25.jpg
new file mode 100644
index 0000000..821f0dd
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/25.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/26.jpg b/models/mPLUG_Owl/OwlEval/cases/26.jpg
new file mode 100644
index 0000000..48ad9a0
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/26.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/27.jpg b/models/mPLUG_Owl/OwlEval/cases/27.jpg
new file mode 100644
index 0000000..cbc2f15
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/27.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/28.jpg b/models/mPLUG_Owl/OwlEval/cases/28.jpg
new file mode 100644
index 0000000..8eefaa2
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/28.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/29.jpg b/models/mPLUG_Owl/OwlEval/cases/29.jpg
new file mode 100644
index 0000000..9ffc4ad
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/29.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/3.jpg b/models/mPLUG_Owl/OwlEval/cases/3.jpg
new file mode 100644
index 0000000..794a56c
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/3.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/30.jpg b/models/mPLUG_Owl/OwlEval/cases/30.jpg
new file mode 100644
index 0000000..a9651dc
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/30.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/31.jpg b/models/mPLUG_Owl/OwlEval/cases/31.jpg
new file mode 100644
index 0000000..db184ec
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/31.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/32.jpg b/models/mPLUG_Owl/OwlEval/cases/32.jpg
new file mode 100644
index 0000000..7d46176
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/32.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/33.jpg b/models/mPLUG_Owl/OwlEval/cases/33.jpg
new file mode 100644
index 0000000..fe25de3
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/33.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/34.jpg b/models/mPLUG_Owl/OwlEval/cases/34.jpg
new file mode 100644
index 0000000..6d58f6f
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/34.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/35.jpg b/models/mPLUG_Owl/OwlEval/cases/35.jpg
new file mode 100644
index 0000000..fef4de9
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/35.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/36.jpg b/models/mPLUG_Owl/OwlEval/cases/36.jpg
new file mode 100644
index 0000000..a520500
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/36.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/37.jpg b/models/mPLUG_Owl/OwlEval/cases/37.jpg
new file mode 100644
index 0000000..6139b2d
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/37.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/38.jpg b/models/mPLUG_Owl/OwlEval/cases/38.jpg
new file mode 100644
index 0000000..6cfc778
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/38.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/39.jpg b/models/mPLUG_Owl/OwlEval/cases/39.jpg
new file mode 100644
index 0000000..f3b79eb
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/39.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/4.jpg b/models/mPLUG_Owl/OwlEval/cases/4.jpg
new file mode 100644
index 0000000..d30900d
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/4.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/40.jpg b/models/mPLUG_Owl/OwlEval/cases/40.jpg
new file mode 100644
index 0000000..7870b83
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/40.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/41.jpg b/models/mPLUG_Owl/OwlEval/cases/41.jpg
new file mode 100644
index 0000000..527ddd9
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/41.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/42.jpg b/models/mPLUG_Owl/OwlEval/cases/42.jpg
new file mode 100644
index 0000000..abe25cd
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/42.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/43.jpg b/models/mPLUG_Owl/OwlEval/cases/43.jpg
new file mode 100644
index 0000000..76d18a9
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/43.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/44.jpg b/models/mPLUG_Owl/OwlEval/cases/44.jpg
new file mode 100644
index 0000000..2380802
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/44.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/45.jpg b/models/mPLUG_Owl/OwlEval/cases/45.jpg
new file mode 100644
index 0000000..c490bf5
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/45.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/46.jpg b/models/mPLUG_Owl/OwlEval/cases/46.jpg
new file mode 100644
index 0000000..07569a9
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/46.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/47.jpg b/models/mPLUG_Owl/OwlEval/cases/47.jpg
new file mode 100644
index 0000000..51b9ff2
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/47.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/48.jpg b/models/mPLUG_Owl/OwlEval/cases/48.jpg
new file mode 100644
index 0000000..ba9eac1
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/48.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/49.jpg b/models/mPLUG_Owl/OwlEval/cases/49.jpg
new file mode 100644
index 0000000..dc4c671
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/49.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/5.jpg b/models/mPLUG_Owl/OwlEval/cases/5.jpg
new file mode 100644
index 0000000..c40a39a
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/5.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/50.jpg b/models/mPLUG_Owl/OwlEval/cases/50.jpg
new file mode 100644
index 0000000..f23f054
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/50.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/6.jpg b/models/mPLUG_Owl/OwlEval/cases/6.jpg
new file mode 100644
index 0000000..c7c1f10
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/6.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/7.jpg b/models/mPLUG_Owl/OwlEval/cases/7.jpg
new file mode 100644
index 0000000..9fb68b0
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/7.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/8.jpg b/models/mPLUG_Owl/OwlEval/cases/8.jpg
new file mode 100644
index 0000000..a12ec55
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/8.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/cases/9.jpg b/models/mPLUG_Owl/OwlEval/cases/9.jpg
new file mode 100644
index 0000000..6e71c58
Binary files /dev/null and b/models/mPLUG_Owl/OwlEval/cases/9.jpg differ
diff --git a/models/mPLUG_Owl/OwlEval/questions.jsonl b/models/mPLUG_Owl/OwlEval/questions.jsonl
new file mode 100644
index 0000000..f3dd876
--- /dev/null
+++ b/models/mPLUG_Owl/OwlEval/questions.jsonl
@@ -0,0 +1,82 @@
+{"image": "1.jpg", "question_id": 1, "question": "What is funny about this image? Describe it panel by panel."}
+{"image": "2.jpg", "question_id": 2, "question": "Why would a person find this image funny?"}
+{"image": "3.jpg", "question_id": 3, "question": "What is unusual about this image"}
+{"image": "4.jpg", "question_id": 4, "question": "Is there text in the image? If so, can you read it and explain it to me?"}
+{"image": "4.jpg", "question_id": 5, "question": "What is funny about it?"}
+{"image": "5.jpg", "question_id": 6, "question": "Where is the frisbee in the image?"}
+{"image": "5.jpg", "question_id": 7, "question": "Where is the man in the image?"}
+{"image": "5.jpg", "question_id": 8, "question": "What is the spatial relation between the frisbee and the man?"}
+{"image": "6.jpg", "question_id": 9, "question": "What should I do after kneading the dough, before dividing it into pieces?"}
+{"image": "6.jpg", "question_id": 10, "question": "How long should I put in warm places?"}
+{"image": "7.jpg", "question_id": 11, "question": "What is the total amount?"}
+{"image": "8.jpg", "question_id": 12, "question": "Excluding the regular cab, what is the average total fueling cost?"}
+{"image": "9.jpg", "question_id": 13, "question": "Excluding Ford F150 Lightning, what is the average total charging cost?"}
+{"image": "9.jpg", "question_id": 14, "question": "Now how much cheaper is the average charging cost over the electric cars (including Ford F150 Lightning) than the average fueling cost over the gasoline cars (including Ford F150(Regular Cab))?Let's think step-by-step."}
+{"image": "10.jpg", "question_id": 15, "question": "How many bedrooms are there in this floor plan?"}
+{"image": "10.jpg", "question_id": 16, "question": "What is the dimension of Master Bedroom"}
+{"image": "10.jpg", "question_id": 17, "question": "Is there a bathroom in master bedroom?"}
+{"image": "10.jpg", "question_id": 18, "question": "What is the dimension of Second Bedroom?"}
+{"image": "10.jpg", "question_id": 19, "question": "Is the terrace bigger than the kitchen?"}
+{"image": "10.jpg", "question_id": 20, "question": "What are the appliances in the kitchen in this floorplan"}
+{"image": "10.jpg", "question_id": 21, "question": "What will you see when you enter through the entrance? and what is the dimension of it?"}
+{"image": "10.jpg", "question_id": 22, "question": "Can you summarize this floorplan based on the aspects I was interested in, in a few sentences?"}
+{"image": "11.jpg", "question_id": 23, "question": "Can you read the text in this flowchart and explain the process in detail?"}
+{"image": "11.jpg", "question_id": 24, "question": "Which step in the flowchart is drawn with a diamond shape?"}
+{"image": "12.jpg", "question_id": 25, "question": "Can you read the text in this flowchart and explain it to me?"}
+{"image": "12.jpg", "question_id": 26, "question": "Can you transform this flowchart into a simple program in python?"}
+{"image": "13.jpg", "question_id": 27, "question": "How much is Wilkins earning?"}
+{"image": "13.jpg", "question_id": 28, "question": "Which department is the person who has the highest salary from?"}
+{"image": "13.jpg", "question_id": 29, "question": "This table is reporting monthly salary,can you calculate the yearly salary for each employee, and summarize it into a table and group them by department?"}
+{"image": "13.jpg", "question_id": 30, "question": "Given the yearly bonus is 10%,12% and 15%for Administration,Finance,Marketing, respectively. Please provide me with the bonus for Emily, Max and Gloria."}
+{"image": "14.jpg", "question_id": 31, "question": "Answer question I.1.a. Think step-by-step."}
+{"image": "15.jpg", "question_id": 32, "question": "Can you write a template for this website in HTML? Replace the joke and punchline with a real one."}
+{"image": "16.jpg", "question_id": 33, "question": "How many championship rings did the player on the left win in his career?"}
+{"image": "17.jpg", "question_id": 34, "question": "Describe this image."}
+{"image": "17.jpg", "question_id": 35, "question": "Explain why this image is funny."}
+{"image": "18.jpg", "question_id": 36, "question": "Describe this image as detailed as possible."}
+{"image": "19.jpg", "question_id": 37, "question": "Describe this image as detailed as possible."}
+{"image": "20.jpg", "question_id": 38, "question": "Explain why this meme is funny."}
+{"image": "21.jpg", "question_id": 39, "question": "Describe the image."}
+{"image": "21.jpg", "question_id": 40, "question": "Can this happen in real world?"}
+{"image": "22.jpg", "question_id": 41, "question": "Describe the image."}
+{"image": "22.jpg", "question_id": 42, "question": "Is this image common in real world?"}
+{"image": "23.jpg", "question_id": 43, "question": "Write brief HTML/JS to turn this mock-up into a colorful website, where the jokes are replaced by a real joke."}
+{"image": "24.jpg", "question_id": 44, "question": "What is the issue with my plant? What should I do now?"}
+{"image": "25.jpg", "question_id": 45, "question": "Why this happens and how to fix it?"}
+{"image": "26.jpg", "question_id": 46, "question": "Can you write me a beautiful poem about this image?"}
+{"image": "27.jpg", "question_id": 47, "question": "Can you write me a master rap song that rhymes very well based on this image?"}
+{"image": "28.jpg", "question_id": 48, "question": "Can you write a story around this image for my child?"}
+{"image": "29.jpg", "question_id": 49, "question": "I designed a character for my game as shown in the image. Can you write me a background story for him?"}
+{"image": "30.jpg", "question_id": 50, "question": "We design new mugs shown in the image. Can you help us write an advertisement?"}
+{"image": "31.jpg", "question_id": 51, "question": "Help me write an advertisement for this."}
+{"image": "32.jpg", "question_id": 52, "question": "Who is the person in the image?"}
+{"image": "33.jpg", "question_id": 53, "question": "Describe this image."}
+{"image": "33.jpg", "question_id": 54, "question": "What do you think about the person's behavior?"}
+{"image": "34.jpg", "question_id": 55, "question": "Describe about the image."}
+{"image": "34.jpg", "question_id": 56, "question": "Do you have anything to say to the young boy in the image?"}
+{"image": "35.jpg", "question_id": 57, "question": "Can you give me a short introduction to this movie?"}
+{"image": "36.jpg", "question_id": 58, "question": "Can you give me a detailed introduction to this painting, and also the artist?"}
+{"image": "37.jpg", "question_id": 59, "question": "How should I do to cook this?"}
+{"image": "38.jpg", "question_id": 60, "question": "Explain the advantages of this product."}
+{"image": "39.jpg", "question_id": 61, "question": "Tell me something about the history of this place."}
+{"image": "40.jpg", "question_id": 62, "question": "Is this photo unusual?"}
+{"image": "40.jpg", "question_id": 63, "question": "How could someone get out of the house? "}
+{"image": "41.jpg", "question_id": 64, "question": "What are shown in the photo?"}
+{"image": "41.jpg", "question_id": 65, "question": "What does the man feel and why? "}
+{"image": "42.jpg", "question_id": 66, "question": "What are the ingredients I need to make this?"}
+{"image": "42.jpg", "question_id": 67, "question": "What is the first step?"}
+{"image": "43.jpg", "question_id": 68, "question": "Which city is this?"}
+{"image": "43.jpg", "question_id": 69, "question": "Why do you think so?"}
+{"image": "44.jpg", "question_id": 70, "question": "What happened at the end of this movie?"}
+{"image": "44.jpg", "question_id": 71, "question": "Did Leonardo Dicaprio's character survive?"}
+{"image": "45.jpg", "question_id": 72, "question": "What is in the photo?"}
+{"image": "45.jpg", "question_id": 73, "question": "What is the nose made of?"}
+{"image": "46.jpg", "question_id": 74, "question": "Write a conversation between the two animals."}
+{"image": "47.jpg", "question_id": 75, "question": "Who is the man on the right?"}
+{"image": "47.jpg", "question_id": 76, "question": "Who is the man on the left?"}
+{"image": "47.jpg", "question_id": 77, "question": "Who is the taller man?"}
+{"image": "47.jpg", "question_id": 78, "question": "Who is wearing red?"}
+{"image": "47.jpg", "question_id": 79, "question": "Who is the man with the water bottle in his hand?"}
+{"image": "48.jpg", "question_id": 80, "question": "Introduce the movie character in this picture."}
+{"image": "49.jpg", "question_id": 81, "question": "Analyze the tactical characteristics and lineups of both sides and predict who will win."}
+{"image": "50.jpg", "question_id": 82, "question": "Tell me about this work of art."}
diff --git a/models/mPLUG_Owl/README.md b/models/mPLUG_Owl/README.md
new file mode 100644
index 0000000..556a56f
--- /dev/null
+++ b/models/mPLUG_Owl/README.md
@@ -0,0 +1,284 @@
+
+
+
+
+# mPLUG-Owl🦉: Modularization Empowers Large Language Models with Multimodality
+
+
+## Examples
+
+
+
+## News
+* 🔥 [05.19] mPLUG-Owl now *natively supports Huggingface-style usage* with the Huggingface Trainer. Users can train their customized models with only *a single V100 GPU*! We have also refactored the project code to make it much easier to use. The offline demo can run inference on *a single 16GB T4 GPU* with *8-bit* support!
+* 🔥 [05.16] We have retrained our model on video-image-text data. The online demo has been updated, and new checkpoints and improved code are coming soon.
+* 🔥 [05.16] Online demo on [HuggingFace](https://huggingface.co/spaces/MAGAer13/mPLUG-Owl) now supports 8 bits!
+* 🔥 [05.12] Online demo and API available on [Replicate](https://replicate.com/joehoover/mplug-owl)!
+* [05.05] We released code and dataset for instruction tuning.
+* [05.05] Online demo on [HuggingFace](https://huggingface.co/spaces/MAGAer13/mPLUG-Owl) is available. Thanks to Huggingface for providing us with free computing resources!
+* [05.05] The online demo on HuggingFace now supports receiving video input! The demo on ModelScope will support it soon.
+* [05.05] We have uploaded our visually-related evaluation set **OwlEval**.
+* [04.26] We provide an [online demo](https://modelscope.cn/studios/damo/mPLUG-Owl/summary) on ModelScope for the public to try.
+* [04.26] We released code of mPLUG-Owl🦉 with its pre-trained and instruction tuning checkpoints.
+
+## Spotlights
+* A new training paradigm with a **modularized design** for large multi-modal language models.
+* Learns visual knowledge while supporting **multi-turn conversations** consisting of different modalities (images/videos/texts).
+* Observed abilities such as **multi-image correlation**, **scene text understanding**, and **vision-based document comprehension**.
+* Releases a visually-related instruction evaluation set, **OwlEval**.
+* Our outstanding works on modularization:
+ * [E2E-VLP](https://aclanthology.org/2021.acl-long.42/), [mPLUG](https://aclanthology.org/2022.emnlp-main.488/) and [mPLUG-2](https://arxiv.org/abs/2302.00402) were accepted by ACL 2021, EMNLP 2022 and ICML 2023, respectively.
+ * [mPLUG](https://aclanthology.org/2022.emnlp-main.488/) is the first to achieve human parity on the [VQA Challenge](https://eval.ai/web/challenges/challenge-page/830/leaderboard/2278).
+* Coming soon
+ - [ ] Multilingual support (e.g., Chinese, Japanese, German, French, etc.)
+ - [ ] Instruction tuning on interleaved data (multiple images and videos).
+ - [x] Publish on Huggingface Hub / Model Hub
+ - [x] Huggingface space demo.
+ - [x] Instruction tuning code and pre-training code.
+ - [x] A visually-related evaluation set **OwlEval** to comprehensively evaluate various models.
+
+## Compatibility with v0 branch
+
+The code in the current main branch has been refactored in Huggingface style, and several issues with the model have been fixed. We have re-trained the models and released new checkpoints on the Huggingface Hub. As a result, the old code and the new checkpoints are incompatible; we have moved the old code into the v0 branch.
+
+
+
+
+## Online Demo
+
+### Replicate
+
+[Run on Replicate](https://replicate.com/joehoover/mplug-owl)
+
+### ModelScope
+
+
+### Hugging Face
+
+
+[Demo on Hugging Face Spaces](https://huggingface.co/spaces/MAGAer13/mPLUG-Owl)
+
+
+
+## Checkpoints on Huggingface Model Hub
+|Model|Phase|Download link|
+|-|-|-|
+|mPLUG-Owl 7B|Pre-training|[Download link](https://huggingface.co/MAGAer13/mplug-owl-llama-7b-pt)|
+|mPLUG-Owl 7B|Instruction tuning (LoRA)|[Download link](https://huggingface.co/MAGAer13/mplug-owl-llama-7b)|
+|mPLUG-Owl 7B|Instruction tuning (FT)|[Download link](https://huggingface.co/MAGAer13/mplug-owl-llama-7b-ft)|
+
+## OwlEval
+The evaluation dataset OwlEval can be found in ```./OwlEval```.
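+
+Each line of `OwlEval/questions.jsonl` is a JSON object with `image`, `question_id` and `question` fields, and the referenced images live under `OwlEval/cases/` (see the data added in this repo). Below is a minimal sketch for iterating over the set; the loop body is only illustrative and would be replaced by your model call:
+```Python
+import json
+from pathlib import Path
+
+owl_eval_dir = Path('./OwlEval')
+
+with open(owl_eval_dir / 'questions.jsonl', 'r', encoding='utf-8') as f:
+    for line in f:
+        sample = json.loads(line)
+        image_path = owl_eval_dir / 'cases' / sample['image']
+        # Feed image_path and sample['question'] to your model here.
+        print(sample['question_id'], image_path, sample['question'])
+```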
+
+## Usage
+### Install Requirements
+1. Create conda environment
+```bash
+conda create -n mplug_owl python=3.10
+conda activate mplug_owl
+```
+
+2. Install PyTorch
+
+```bash
+conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 pytorch-cuda=11.7 -c pytorch -c nvidia
+```
+
+3. Install other dependencies
+```bash
+pip install -r requirements.txt
+```
+
+### Local Demo
+We provide a script to deploy a simple demo in your local machine.
+```Bash
+python -m serve.web_server --base-model 'your checkpoint directory' --bf16
+```
+For example, to load the checkpoint `MAGAer13/mplug-owl-llama-7b` from the Huggingface Model Hub, run:
+```Bash
+python -m serve.web_server --base-model MAGAer13/mplug-owl-llama-7b --bf16
+```
+
+### Run Model with Huggingface Style
+#### Model initialization
+If you want to load the model (e.g. `MAGAer13/mplug-owl-llama-7b`) from the Huggingface model hub or from a local path, you can use the following code snippet.
+```Python
+# Load via Huggingface Style
+import torch
+
+from mplug_owl.modeling_mplug_owl import MplugOwlForConditionalGeneration
+from mplug_owl.tokenization_mplug_owl import MplugOwlTokenizer
+from mplug_owl.processing_mplug_owl import MplugOwlImageProcessor, MplugOwlProcessor
+
+pretrained_ckpt = 'MAGAer13/mplug-owl-llama-7b'
+model = MplugOwlForConditionalGeneration.from_pretrained(
+ pretrained_ckpt,
+ torch_dtype=torch.bfloat16,
+)
+image_processor = MplugOwlImageProcessor.from_pretrained(pretrained_ckpt)
+tokenizer = MplugOwlTokenizer.from_pretrained(pretrained_ckpt)
+processor = MplugOwlProcessor(image_processor, tokenizer)
+```
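+
+The news above also mentions running the offline demo with 8-bit support on a single 16GB GPU. As a minimal sketch, assuming `bitsandbytes` is installed and that the generic `transformers` quantization flags apply to this model class (an assumption, not a repo-specific API), the model could be loaded as follows:
+```Python
+# Hypothetical 8-bit loading sketch; load_in_8bit and device_map are standard
+# transformers from_pretrained flags and require the bitsandbytes package.
+model_8bit = MplugOwlForConditionalGeneration.from_pretrained(
+    pretrained_ckpt,
+    load_in_8bit=True,
+    device_map='auto',
+)
+```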
+
+#### Model inference
+Prepare model inputs.
+```Python
+# We use a human/AI template to organize the context as a multi-turn conversation.
+# <image> denotes an image placeholder.
+prompts = [
+'''The following is a conversation between a curious human and AI assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+Human: <image>
+Human: Explain why this meme is funny.
+AI: ''']
+
+# The image paths should be placed in the image_list and kept in the same order as in the prompts.
+# We support URLs, local file paths and base64 strings. You can customize image pre-processing by modifying mplug_owl.modeling_mplug_owl.ImageProcessor.
+image_list = ['https://xxx.com/image.jpg']
+```
+
+Get response.
+```Python
+# generate kwargs (the same as in transformers) can be passed to model.generate()
+generate_kwargs = {
+ 'do_sample': True,
+ 'top_k': 5,
+ 'max_length': 512
+}
+from PIL import Image
+images = [Image.open(_) for _ in image_list]
+inputs = processor(text=prompts, images=images, return_tensors='pt')
+inputs = {k: v.bfloat16() if v.dtype == torch.float else v for k, v in inputs.items()}
+inputs = {k: v.to(model.device) for k, v in inputs.items()}
+with torch.no_grad():
+ res = model.generate(**inputs, **generate_kwargs)
+sentence = tokenizer.decode(res.tolist()[0], skip_special_tokens=True)
+print(sentence)
+```
+
+
+### Run Model with provided interface
+#### Model initialization
+Build the model, tokenizer and processor.
+```Python
+from pipeline.interface import get_model
+model, tokenizer, processor = get_model(pretrained_ckpt='your checkpoint directory', use_bf16='use bf16 or not')
+```
+
+#### Model inference
+Prepare model inputs.
+```Python
+# We use a human/AI template to organize the context as a multi-turn conversation.
+# <image> denotes an image placeholder.
+prompts = [
+'''The following is a conversation between a curious human and AI assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+Human: <image>
+Human: Explain why this meme is funny.
+AI: ''']
+
+# The image paths should be placed in the image_list and kept in the same order as in the prompts.
+# We support URLs, local file paths and base64 strings. You can customize image pre-processing by modifying mplug_owl.modeling_mplug_owl.ImageProcessor.
+image_list = ['https://xxx.com/image.jpg',]
+```
+
+For multiple image inputs, since this is an emergent ability of the models, we do not know which format works best. Below is an example format we have tried in our experiments. Exploring formats that help models better understand multiple images could be beneficial and is worth further investigation.
+```Python
+prompts = [
+'''The following is a conversation between a curious human and AI assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
+Human: <image>
+Human: <image>
+Human: Do the shirts worn by the individuals in the first and second pictures vary in color? If so, what is the specific color of each shirt?
+AI: ''']
+
+image_list = ['https://xxx.com/image_1.jpg', 'https://xxx.com/image_2.jpg']
+```
+
+Get response.
+```Python
+# generate kwargs (the same as in transformers) can be passed to do_generate()
+from pipeline.interface import do_generate
+sentence = do_generate(prompts, image_list, model, tokenizer, processor,
+ use_bf16=True, max_length=512, top_k=5, do_sample=True)
+```
+### Instruction Tuning
+The training samples are stored in ```xxx.jsonl``` and organized in the following format:
+```json
+{"text": "The following is a conversation between a curious human and AI assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\nHuman: Construct a sentence using the given verb in the past tense\nshudder\nAI: She shuddered at the thought of being alone in the dark.", "task_type": "gpt4instruct_sft"}
+{"image": ["xxx.jpg"], "text": "The following is a conversation between a curious human and AI assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\nHuman: \nHuman: Where is the man located in the image?\nAI: The man is located in a small restaurant, sitting and enjoying his meal.\nHuman: What type of food is the man eating in the image?\nAI: The man is eating a burrito in the image.\nHuman: What can be seen next to the man in the restaurant?\nAI: Next to the man in the restaurant, there is a fridge filled with cans of soda.\nHuman: How does the man feel as he is about to eat his meal?\nAI: The man looks happy and is smiling as he is about to take a bite of his meal.\nHuman: What unique aspect of the photo makes the scene appear distorted or unusual?\nAI: The unique aspect of this photo is that it was taken using a fish-eye lens, which creates a distorted, wide-angle view of the scene, making it appear different from a standard perspective.", "task_type": "llava_sft"}
+```
+The ```task_type``` can be one of ```{'quora_chat_sft', 'sharegpt_chat_sft', 'llava_sft', 'gpt4instruct_sft'}```.
+
+Prepare your own train.jsonl and dev.jsonl and modify ```data_files``` in ```configs/v0.yaml```.
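+
+Below is a minimal sketch for writing samples in this format; the conversation text and file names are placeholders, not real data:
+```Python
+import json
+
+samples = [
+    # Text-only sample.
+    {
+        "text": "The following is a conversation between a curious human and AI assistant. "
+                "The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
+                "Human: Name a primary color.\nAI: Red is a primary color.",
+        "task_type": "gpt4instruct_sft",
+    },
+    # Image-grounded sample; <image> marks where the image is inserted.
+    {
+        "image": ["example.jpg"],
+        "text": "The following is a conversation between a curious human and AI assistant. "
+                "The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
+                "Human: <image>\nHuman: What is shown in the image?\nAI: A dog playing in a park.",
+        "task_type": "llava_sft",
+    },
+]
+
+# Write one JSON object per line.
+with open("train.jsonl", "w", encoding="utf-8") as f:
+    for sample in samples:
+        f.write(json.dumps(sample, ensure_ascii=False) + "\n")
+```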
+
+Execute the training script.
+```
+PYTHONPATH=./ bash train_it.sh # If you want to fine-tune the LLM directly (without LoRA), use train_it_wo_lora.sh instead
+```
+
+### Usage with Replicate
+
+You can also now run the model and demo locally with [`cog`](https://github.com/replicate/cog), an open source ML tool maintained by Replicate. To get started, follow the instructions in this `cog` [fork](https://github.com/replicate/cog-mplug-owl) of `mPLUG-Owl`.
+
+## Performance Comparison
+The comparison results of 50 single-turn responses (left) and 52 multi-turn responses (right) between mPLUG-Owl and baselines under manual evaluation. A/B/C/D denote the rating of each response.
+
+
+
+## Related Projects
+
+* [LLaMA](https://github.com/facebookresearch/llama). An open-source collection of state-of-the-art large pre-trained language models.
+* [Baize](https://github.com/project-baize/baize-chatbot). An open-source chat model trained with LoRA on 100k dialogs generated by letting ChatGPT chat with itself.
+* [Alpaca](https://github.com/tatsu-lab/stanford_alpaca). A fine-tuned model trained from a 7B LLaMA model on 52K instruction-following data.
+* [LoRA](https://github.com/microsoft/LoRA). A plug-and-play module that can greatly reduce the number of trainable parameters for downstream tasks.
+* [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4). A multi-modal language model that aligns a frozen visual encoder with a frozen LLM using just one projection layer.
+* [LLaVA](https://github.com/haotian-liu/LLaVA). A visual instruction tuned vision language model which achieves GPT4 level capabilities.
+* [mPLUG](https://github.com/alibaba/AliceMind/tree/main/mPLUG). A vision-language foundation model for both cross-modal understanding and generation.
+* [mPLUG-2](https://github.com/alibaba/AliceMind). A multimodal model with a modular design, which inspired our project.
+
+## Citation
+If you find this work useful, consider giving this repository a star and citing our paper as follows:
+```
+@misc{ye2023mplugowl,
+ title={mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality},
+ author={Qinghao Ye and Haiyang Xu and Guohai Xu and Jiabo Ye and Ming Yan and Yiyang Zhou and Junyang Wang and Anwen Hu and Pengcheng Shi and Yaya Shi and Chaoya Jiang and Chenliang Li and Yuanhong Xu and Hehong Chen and Junfeng Tian and Qian Qi and Ji Zhang and Fei Huang},
+ year={2023},
+ eprint={2304.14178},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+```
diff --git a/models/mPLUG_Owl/README_zh.md b/models/mPLUG_Owl/README_zh.md
new file mode 100644
index 0000000..8bf4aa9
--- /dev/null
+++ b/models/mPLUG_Owl/README_zh.md
@@ -0,0 +1,211 @@
+
+
+
+
+# mPLUG-Owl🦉: Modularization Empowers Large Language Models with Multimodality
+