feat: add mmkd

2025-06-24 19:47:16 +08:00
parent 0165f28f3f
commit b91ea7f4a0
3 changed files with 257 additions and 0 deletions
--- a/configs/mmkd_black_box_api.json
+++ b/configs/mmkd_black_box_api.json
@@ -0,0 +1,30 @@
 {
  "job_type": "mmkd_black_box_api",
  "dataset": {
    "instruction_path": "train.json",
    "labeled_path": "train_labeled.json",
    "seed": 42
  },
  "inference":{
    "base_url": "ENDPOINT",
    "api_key": "TOKEN",
    "system_prompt" : "You are a helpful assistant.",
    "max_new_tokens": 512
  },
  "models": {
    "student": "student/Qwen/Qwen2.5-VL-3B-Instruct/"
  },
  "training": {
    "output_dir": "./result/",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "max_length": 512,
    "save_steps": 1000,
    "logging_steps": 1,
    "learning_rate": 2e-5,
    "weight_decay": 0.05,
    "warmup_ratio": 0.1,
    "lr_scheduler_type": "cosine"
  }
 }
--- a/easydistill/mmkd/infer.py
+++ b/easydistill/mmkd/infer.py
@@ -0,0 +1,122 @@
 # Copyright 2024 Alibaba Group Holding Limited. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 import json
 import argparse
 import logging
 from tqdm import tqdm
 from openai import OpenAI
 # Configure root logging once at import time: INFO level, with each record
 # rendered as "<timestamp> - <level> - <message>".
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 def read_json_field(filename):
    """Load (instruction, image) pairs from a JSON dataset file.

    The file is expected to hold a JSON array of objects that each provide
    an "instruction" and an "image" field.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A list of ``(instruction, image)`` tuples in file order, or ``None``
        when the file is missing, is not valid JSON, or any other error
        occurs while reading it (the failure is logged, not raised).
    """
    try:
        # Decode explicitly as UTF-8 so non-ASCII instructions load the same
        # way regardless of the platform's default locale encoding.
        with open(filename, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return [(item["instruction"], item["image"]) for item in data]
    except FileNotFoundError:
        logging.error("The file was not found.")
    except json.JSONDecodeError:
        logging.error("There was an error decoding the JSON file.")
    except Exception as e:
        # Covers e.g. records lacking a required key or undecodable bytes.
        logging.error(f"An error occurred: {e}")
    return None
 def write_data_to_json_file(data, file_path):
    """Serialize ``data`` to ``file_path`` as indented JSON.

    Args:
        data: JSON-serializable object to write.
        file_path: Destination path; an existing file is overwritten.

    Returns:
        None. Any failure is logged rather than raised.
    """
    try:
        # ensure_ascii=False emits non-ASCII characters verbatim, so the file
        # must be opened as UTF-8; the locale default encoding may be unable
        # to represent them.
        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=4)
        logging.info(f"Data successfully written to {file_path}")
    except Exception as e:
        logging.error(f"An error occurred: {e}")
 def generate_teacher_response_api(data_list, config):
    """Label every sample with a response from a remote teacher model.

    Connects to the OpenAI-compatible endpoint described by
    ``config["inference"]``, uses the first model the endpoint lists, and
    sends one chat completion request per sample containing the system
    prompt, the image (passed as the ``image_url`` URL) and the instruction
    text. The collected records are written to
    ``config["dataset"]["labeled_path"]``.

    Args:
        data_list: Iterable of ``(instruction, image)`` pairs.
        config: Parsed job configuration; reads ``inference.api_key``,
            ``inference.base_url``, ``inference.system_prompt`` (optional),
            ``inference.max_new_tokens`` and ``dataset.labeled_path``.
    """
    inference_cfg = config["inference"]
    client = OpenAI(
        api_key = inference_cfg["api_key"],
        base_url = inference_cfg["base_url"]
    )
    models = client.models.list()
    model = models.data[0].id
    logging.info(model)
    # A missing, null or empty system prompt falls back to the default
    # instead of raising KeyError or sending a null message to the API.
    system_prompt = inference_cfg.get("system_prompt") or "You are a helpful assistant."
    # Loop-invariant, so look it up once.
    max_new_tokens = inference_cfg["max_new_tokens"]
    outcomes = []
    for text, image in tqdm(data_list, desc="Call remote model and generating responses"):
        messages = [
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image
                        },
                    },
                    {
                        "type": "text",
                        "text": text
                    }
                ]
            }
        ]
        completion = client.chat.completions.create(
            messages = messages,
            model = model,
            max_completion_tokens = max_new_tokens
        )
        result = completion.choices[0].message.content
        outcomes.append({'instruction': text, 'image': image, 'output': result})
    write_data_to_json_file(outcomes, config["dataset"]["labeled_path"])
 def infer_with_teacher_model(config):
    """Generate teacher-labeled distillation data for the configured job.

    Loads the instruction samples from ``config["dataset"]["instruction_path"]``
    and dispatches on ``config["job_type"]``. Only ``"mmkd_black_box_api"``
    is supported; any other value is logged as an error and the function
    returns without producing output.

    Args:
        config: Parsed job configuration dictionary.
    """
    logging.info('Generating distillation data from the teacher model!')
    data_list = read_json_field(config["dataset"]["instruction_path"])
    if data_list is None:
        # read_json_field logs the cause and returns None on failure; stop
        # here rather than failing later while iterating over None.
        logging.error("Inference job terminated: failed to load instruction data.")
        return
    try:
        job_type = config["job_type"]
        if job_type == "mmkd_black_box_api":
            generate_teacher_response_api(data_list, config)
        else:
            logging.error(f"Invalid job type: {job_type}")
            raise ValueError(f"Invalid job type: {job_type}")
    except ValueError as e:
        # This script runs inference, so report it as such.
        logging.error(f"Inference job terminated: {e}")
        return
 def main():
    """Parse ``--config``, load the JSON configuration and run inference."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, required=True, help='path to the json config file')
    args = parser.parse_args()
    # Use a context manager so the config file handle is closed promptly,
    # and decode as UTF-8 independent of the platform locale.
    with open(args.config, 'r', encoding='utf-8') as config_file:
        config = json.load(config_file)
    infer_with_teacher_model(config)
 # Run the command-line entry point only when executed as a script.
 if __name__ == "__main__":
    main()
--- a/easydistill/mmkd/train.py
+++ b/easydistill/mmkd/train.py
@@ -0,0 +1,105 @@
 # Copyright 2024 Alibaba Group Holding Limited. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 import json
 import argparse
 import logging
 from datasets import load_dataset, Dataset
 from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor
 from qwen_vl_utils import process_vision_info
 from trl import SFTTrainer, SFTConfig
 # Configure root logging once at import time: INFO level, with each record
 # rendered as "<timestamp> - <level> - <message>".
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 def train(config):
    """Fine-tune the student Qwen2.5-VL model on teacher-labeled samples.

    Loads the labeled JSON dataset, shuffles it with the configured seed and
    runs ``SFTTrainer`` with a custom multimodal collator. The trained model
    and the processor are saved to ``config["training"]["output_dir"]``.

    Args:
        config: Parsed job configuration; reads ``dataset.labeled_path``,
            ``dataset.seed``, ``models.student`` and the ``training`` section,
            which is forwarded as keyword arguments to ``SFTConfig``.
    """
    dataset = load_dataset("json", data_files=config["dataset"]["labeled_path"])
    dataset = dataset.shuffle(seed=config["dataset"]["seed"])["train"]
    student_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        config["models"]["student"],
        trust_remote_code=True
    )
    processor = Qwen2_5_VLProcessor.from_pretrained(config["models"]["student"])

    # Token ids excluded from the loss. They do not change between batches,
    # so resolve them once instead of on every collate call.
    if isinstance(processor, Qwen2_5_VLProcessor):
        # NOTE(review): presumably the vision-start / vision-end / image-pad
        # ids of the Qwen2.5-VL vocabulary -- confirm against the tokenizer.
        image_tokens = [151652, 151653, 151655]
    else:
        image_tokens = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]
    pad_token_id = processor.tokenizer.pad_token_id

    def collate_fn(examples):
        """Turn raw records (image, instruction, output) into a model batch."""
        texts = []
        images = []
        for example in examples:
            chat = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image","image": example["image"]
                        },
                        {
                            "type": "text","text": example["instruction"]
                        }
                    ]
                },
                {
                    "role": "assistant",
                    "content": example["output"]
                }
            ]
            text = processor.apply_chat_template(chat, tokenize=False)
            texts.append(text)
            # Only the image inputs are used; the second return value is ignored.
            image, _ = process_vision_info(chat)
            images.append(image)
        batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
        # Labels mirror the input ids, with padding and image placeholder
        # positions set to -100 so they are ignored by the loss.
        labels = batch["input_ids"].clone()
        labels[labels == pad_token_id] = -100
        for image_token_id in image_tokens:
            labels[labels == image_token_id] = -100
        batch["labels"] = labels
        return batch

    training_arguments = SFTConfig(**config["training"])
    training_arguments.gradient_checkpointing_kwargs = dict(use_reentrant=False)
    # The collator needs the raw columns, so the trainer must neither drop
    # them nor run its own dataset preparation.
    training_arguments.remove_unused_columns = False
    training_arguments.dataset_kwargs = {"skip_prepare_dataset": True}
    trainer = SFTTrainer(
        model=student_model,
        data_collator=collate_fn,
        processing_class=processor.tokenizer,
        args=training_arguments,
        train_dataset=dataset
    )
    trainer.train()
    output_dir = config["training"]["output_dir"]
    trainer.save_model(output_dir)
    # Save the whole processor (tokenizer plus image-processor configuration)
    # so the output directory can be reloaded for multimodal use; saving the
    # tokenizer alone leaves the image preprocessing config out.
    processor.save_pretrained(output_dir)
 def main():
    """Parse ``--config``, load the JSON configuration and start training."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, required=True, help='path to the json config file')
    args = parser.parse_args()
    # Use a context manager so the config file handle is closed promptly,
    # and decode as UTF-8 independent of the platform locale.
    with open(args.config, 'r', encoding='utf-8') as config_file:
        config = json.load(config_file)
    train(config)
 # Run the command-line entry point only when executed as a script.
 if __name__ == "__main__":
    main()