Compare commits
10 Commits: 96fa4efa49...dev/vqa_pr

SHA1
2fc34e192a
d3bd2806e8
a520d9cae5
a12a8714e4
1f7fa63676
75d74fbe70
4110d9e12a
228fa8c81b
c35a1621b2
8d781d68df
293576   data/vq_multi_turn_nolabel_psycho.json (new file; diff suppressed because it is too large)
346664   data/vq_nolabel_psycho.json (new file; diff suppressed because it is too large)
520697   data/vqa_label.json (new file; diff suppressed because it is too large)
476441   data/vqa_multi_turn_label.json (new file; diff suppressed because it is too large)
221   easydistill/mmkd/create_vqa_pairs.py (new file)
@@ -0,0 +1,221 @@
import json
import numpy as np
import argparse
import os
import glob
from pathlib import Path
from collections import defaultdict


def load_json(filepath):
    if not filepath or not os.path.exists(filepath):
        print("Info: Label file not found. Preparing question-only data.")
        return None
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error: The file at {filepath} is not a valid JSON file. Details: {e}")
        return None


def read_text_file(filepath):
    """Loads a prompt from a text file."""
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return f.read().strip()
    except FileNotFoundError:
        print(f"Error: The file {filepath} was not found.")
        return None


def build_user_prompt(template, json_schema, language):
    """
    Constructs the user prompt by selecting a random question and injecting
    the appropriate JSON sub-schema.
    """
    # 1. Select a random natural language question from the template
    user_question_template = np.random.choice(template["prompts"][language])

    # 2. Build the sub-schema based on the template's target_keys
    sub_schema_properties = {
        key: json_schema["properties"][key]
        for key in template["target_keys"]
        if key in json_schema.get("properties", {})
    }
    sub_schema = {"type": "object", "properties": sub_schema_properties}
    sub_schema_string = json.dumps(sub_schema, indent=4)

    # 3. Combine them into the final prompt
    return f"""{user_question_template}
Strictly return a valid JSON following this schema:

**Json schema**
{sub_schema_string}
"""


def prepare_vqa(
    label_json_path: str,
    prompt_template_path: str,
    system_prompt_path: str,
    json_schema_path: str,
    media_dir: str,
    output_vqa_json_path: str,
    num_random_templates: int,  # New argument to control sampling
):
    # --- Load all configuration files ---
    label_data = load_json(label_json_path)
    prompt_templates = load_json(prompt_template_path)
    system_prompt = read_text_file(system_prompt_path)
    json_schema = load_json(json_schema_path)

    if not prompt_templates or not system_prompt or not json_schema:
        print("Error: Could not load required prompt templates, system prompt, or JSON schema. Exiting.")
        return

    # --- Separate the 'full' template from the others ---
    full_template = None
    other_templates = []
    for t in prompt_templates.get("templates", []):
        if t.get("group_name") == "full_invoice_extraction":
            full_template = t
        else:
            other_templates.append(t)

    if not full_template:
        print("Warning: 'full_invoice_extraction' template not found. Proceeding with random templates only.")

    final_conversations = []

    # --- Conditional logic: check if we are in labeled or unlabeled mode ---
    if label_data:
        # --- SCENARIO A: LABELED DATA ---
        print("Mode: Generating VQA from ground-truth labels.")
        for label_entry in label_data:
            image_prefix = label_entry.get("image")
            ground_truth_data = label_entry.get("label")  # Can be a dict or a list of dicts
            if not image_prefix or not ground_truth_data:
                continue

            # Find all pages associated with the image prefix
            search_pattern = os.path.join(media_dir, f"{Path(image_prefix).stem}*")
            image_paths = sorted(glob.glob(search_pattern))
            if not image_paths:
                continue

            image_contents = [{"type": "image", "image": path} for path in image_paths]

            # Build the list of templates to use for this document
            templates_to_use = []
            if full_template:
                templates_to_use.append(full_template)

            num_to_sample = min(num_random_templates, len(other_templates))
            if num_to_sample > 0:
                templates_to_use.extend(np.random.choice(other_templates, size=num_to_sample, replace=False).tolist())

            # Generate a conversation for each selected template
            for template in templates_to_use:
                language = np.random.choice(list(template["prompts"].keys()))
                user_question = build_user_prompt(template, json_schema, language)

                system_message = {"role": "system", "content": system_prompt}
                user_message = {
                    "role": "user",
                    "content": image_contents + [{"type": "text", "text": "<image>" * len(image_contents) + user_question}],
                }

                # --- MODIFICATION IS HERE ---
                # This block now handles both single (dict) and multiple (list) invoices.
                assistant_content_string = ""
                if isinstance(ground_truth_data, dict):
                    # Case 1: Single invoice. Create a single JSON object.
                    assistant_label = {key: ground_truth_data.get(key) for key in template["target_keys"]}
                    assistant_content_string = json.dumps(assistant_label, indent=4)

                elif isinstance(ground_truth_data, list):
                    # Case 2: Multiple invoices. Create a list of JSON objects.
                    assistant_labels_list = []
                    for invoice_dict in ground_truth_data:
                        if isinstance(invoice_dict, dict):
                            sub_label = {key: invoice_dict.get(key) for key in template["target_keys"]}
                            assistant_labels_list.append(sub_label)
                    # The final output is a string representation of the list of objects
                    assistant_content_string = json.dumps(assistant_labels_list, indent=4)

                if not assistant_content_string:
                    continue  # Skip if the label format was invalid

                assistant_message = {
                    "role": "assistant_gt",
                    "content": [{"type": "text", "text": assistant_content_string}],
                }

                final_conversations.append([system_message, user_message, assistant_message])
    else:
        # --- SCENARIO B: UNLABELED DATA ---
        print("Mode: Generating question-only VQA from image directory.")

        all_images = glob.glob(os.path.join(media_dir, "*.[jp][pn]g"))
        documents = defaultdict(list)
        for img_path in all_images:
            stem = Path(img_path).stem
            prefix = stem.rsplit('_', 1)[0] if '_' in stem and stem.rsplit('_', 1)[1].isdigit() else stem
            documents[prefix].append(img_path)

        for doc_prefix, image_paths in documents.items():
            image_contents = [{"type": "image", "image": path} for path in sorted(image_paths)]

            # --- Build the list of templates to use for this document ---
            templates_to_use = []
            if full_template:
                templates_to_use.append(full_template)

            num_to_sample = min(num_random_templates, len(other_templates))
            if num_to_sample > 0:
                templates_to_use.extend(np.random.choice(other_templates, size=num_to_sample, replace=False).tolist())

            # Generate a conversation for each selected template
            for template in templates_to_use:
                language = np.random.choice(list(template["prompts"].keys()))
                user_question = build_user_prompt(template, json_schema, language)

                system_message = {"role": "system", "content": system_prompt}
                user_message = {
                    "role": "user",
                    "content": image_contents + [{"type": "text", "text": "<image>" * len(image_contents) + user_question}],
                }

                final_conversations.append([system_message, user_message])

    # --- Save the final output ---
    with open(output_vqa_json_path, "w", encoding="utf-8") as output_file:
        json.dump(final_conversations, output_file, indent=4)

    print(f"\nSuccess! Generated {len(final_conversations)} conversations.")
    print(f"Output saved to: {output_vqa_json_path}")


# --- Main execution ---
if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--media_dir", type=str, required=True)
    argparser.add_argument("--prompt_template_path", type=str, required=True)
    argparser.add_argument("--system_prompt_path", type=str, required=True)
    argparser.add_argument("--json_schema_path", type=str, required=True)
    argparser.add_argument("--output_vqa_json_path", type=str, required=True)
    argparser.add_argument("--label_json_path", type=str, default=None)
    argparser.add_argument(
        "--num_random_templates",
        type=int,
        default=9,
        help="Number of random templates to select in addition to the 'full_invoice_extraction' one."
    )

    args = argparser.parse_args()

    prepare_vqa(
        args.label_json_path,
        args.prompt_template_path,
        args.system_prompt_path,
        args.json_schema_path,
        args.media_dir,
        args.output_vqa_json_path,
        args.num_random_templates,
    )
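A typical invocation of the new script could look like the following; the flag names come from the argparse definition above, while the concrete paths are placeholders rather than values from this PR:

python3 easydistill/mmkd/create_vqa_pairs.py \
    --media_dir /data/invoices/images \
    --prompt_template_path prompt_templates.json \
    --system_prompt_path system_prompt.txt \
    --json_schema_path invoice_schema.json \
    --output_vqa_json_path data/vqa_label.json \
    --label_json_path /data/invoices/label_data.json \
    --num_random_templates 9

Omitting --label_json_path (its default is None) switches the script into the question-only mode of Scenario B.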
@@ -191,11 +191,11 @@ def generate_vqa_conversations(
            "role": "user",
            # The content is the list of image dicts, followed by the text dict
            "content": image_content_list
-            + [{"type": "text", "text": "<image>" + question_text}],
+            + [{"type": "text", "text": "<image>" * len(found_image_paths) + question_text}],
        }

-        assistant_message = {"role": "assistant", "content": answer_text}
+        assistant_message = {"role": "assistant_gt", "content": answer_text}  # [{"type": "text", "text": answer_text}]

        conversation = [system_message, user_message, assistant_message]
        final_conversations.append(conversation)
@@ -206,7 +206,110 @@ def generate_vqa_conversations(
    print(f"Success! Generated {len(final_conversations)} conversational VQA entries.")
    print(f"Formatted data saved to: {output_path}")
+
+
+# --- Conversations Generation for Multi-Turn Dialogues ---
+def generate_multiturn_conversations(
+    labels_path,
+    image_root,
+    system_prompt_path,
+    questions_path,
+    answers_path,
+    output_path,
+):
+    """
+    Generates multi-turn conversational VQA pairs based on predefined field groups.
+    """
+    all_data_entries = load_json(labels_path)
+    system_prompt = read_text_file(system_prompt_path)
+    question_bank = load_json(questions_path)
+    answer_bank = load_json(answers_path)
+
+    if (
+        not all_data_entries
+        or not system_prompt
+        or not question_bank
+        or not answer_bank
+    ):
+        print("Could not load one or more necessary files. Exiting.")
+        return
+
+    # --- MODIFICATION: Define the field groupings for multi-turn conversations ---
+    CONVERSATION_GROUPS = {
+        "doctor_name": ["profession", "finess_number", "rpps_number", "adeli_number"],
+        "beneficiary_name": ["beneficiary_dob", "security_number"],
+        "bill_paid": ["mandatory_coverage", "complementary_coverage", "client_part", "amount_paid"],
+    }
+
+    final_conversations = []
+
+    for entry in all_data_entries:
+        label_data = entry.get("label")
+        image_filename_prefix = entry.get("image")
+
+        if not label_data or not image_filename_prefix:
+            continue
+
+        # Find all image files associated with this entry
+        prefix_stem = Path(image_filename_prefix).stem
+        search_pattern = os.path.join(image_root, f"{prefix_stem}*")
+        found_image_paths = sorted(glob.glob(search_pattern))
+
+        if not found_image_paths:
+            continue
+
+        image_content_list = [
+            {"type": "image", "image": path} for path in found_image_paths
+        ]
+
+        # --- Create a multi-turn conversation for each group ---
+        for main_field, related_fields in CONVERSATION_GROUPS.items():
+            # Start a conversation only if the main field exists in the label
+            if main_field not in label_data:
+                continue
+
+            conversation = []
+            language = random.choice(["english", "french"])
+
+            # 1. Add the System Prompt
+            conversation.append({"role": "system", "content": system_prompt})
+
+            # 2. First User Turn (with image)
+            first_question = random.choice(question_bank[main_field][language])
+            conversation.append({
+                "role": "user",
+                "content": image_content_list + [{"type": "text", "text": "<image>" * len(found_image_paths) + first_question}],
+            })
+
+            # 3. First Assistant Turn
+            first_answer = get_conversational_answer(
+                main_field, label_data, answer_bank, language
+            )
+            conversation.append({"role": "assistant_gt", "content": first_answer})
+
+            # 4. Follow-up Turns for related fields
+            for follow_up_field in related_fields:
+                if follow_up_field in label_data:
+                    # Follow-up User Turn (text only)
+                    follow_up_question = random.choice(question_bank[follow_up_field][language])
+                    conversation.append({
+                        "role": "user",
+                        "content": [{"type": "text", "text": follow_up_question}],
+                    })
+
+                    # Follow-up Assistant Turn
+                    follow_up_answer = get_conversational_answer(
+                        follow_up_field, label_data, answer_bank, language
+                    )
+                    conversation.append({"role": "assistant_gt", "content": follow_up_answer})
+
+            final_conversations.append(conversation)
+
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(final_conversations, f, indent=4, ensure_ascii=False)
+
+    print(f"Success! Generated {len(final_conversations)} multi-turn VQA conversations.")
+    print(f"Formatted data saved to: {output_path}")
+
+
# --- Conversations Generation for only Images ---
def generate_vq_question(
    image_root, system_prompt_path, questions_path, output_path, ratio=0.4
@@ -260,7 +363,7 @@ def generate_vq_question(
        user_message = {
            "role": "user",
            "content": image_content_list
-            + [{"type": "text", "text": "<image>" + question_text}],
+            + [{"type": "text", "text": "<image>" * len(image_paths) + question_text}],
        }
        conversation = [system_message, user_message]
        final_conversations.append(conversation)
@@ -273,44 +376,102 @@ def generate_vq_question(
    )
    print(f"Formatted data saved to: {output_path}")
+
+
+# --- Conversations Generation for Multi-Turn Questions (No Labels) ---
+def generate_multiturn_vq_question(
+    image_root, system_prompt_path, questions_path, output_path
+):
+    """
+    Generates multi-turn, question-only conversational prompts for each document.
+    """
+    system_prompt = read_text_file(system_prompt_path)
+    question_bank = load_json(questions_path)
+
+    if not system_prompt or not question_bank:
+        print("Could not load one or more necessary files. Exiting.")
+        return
+
+    # --- MODIFICATION: Define the same field groupings ---
+    CONVERSATION_GROUPS = {
+        "doctor_name": ["profession", "finess_number", "rpps_number", "adeli_number"],
+        "beneficiary_name": ["beneficiary_dob", "security_number"],
+        "bill_paid": ["mandatory_coverage", "complementary_coverage", "client_part", "amount_paid"],
+    }
+
+    # Find all images and group by prefix
+    all_image_paths = sorted(
+        glob.glob(os.path.join(image_root, "*.jpg"))
+        + glob.glob(os.path.join(image_root, "*.png"))
+        + glob.glob(os.path.join(image_root, "*.jpeg"))
+    )
+    prefix_to_images = {}
+    for path in all_image_paths:
+        if not os.path.isfile(path):
+            continue
+        stem = Path(path).stem
+        prefix = re.sub(r"(_\d+(_scale)?)$", "", stem)
+        prefix_to_images.setdefault(prefix, []).append(path)
+
+    final_conversations = []
+
+    for prefix, image_paths in prefix_to_images.items():
+        image_content_list = [
+            {"type": "image", "image": path} for path in sorted(image_paths)
+        ]
+
+        # --- Create a multi-turn conversation for each group ---
+        for main_field, related_fields in CONVERSATION_GROUPS.items():
+            conversation = []
+            language = random.choice(["english", "french"])
+
+            # 1. Add the System Prompt
+            conversation.append({"role": "system", "content": system_prompt})
+
+            # 2. First User Turn (with image)
+            first_question = random.choice(question_bank[main_field][language])
+            conversation.append({
+                "role": "user",
+                "content": image_content_list + [{"type": "text", "text": "<image>" * len(image_paths) + first_question}],
+            })
+
+            # 3. Follow-up User Turns (text only)
+            for follow_up_field in related_fields:
+                if follow_up_field in question_bank:
+                    follow_up_question = random.choice(question_bank[follow_up_field][language])
+                    conversation.append({
+                        "role": "user",
+                        "content": [{"type": "text", "text": follow_up_question}],
+                    })
+
+            final_conversations.append(conversation)
+
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(final_conversations, f, indent=4, ensure_ascii=False)
+
+    print(f"Success! Generated {len(final_conversations)} multi-turn VQA questions.")
+    print(f"Formatted data saved to: {output_path}")
+
+
# --- Main Execution Block ---
if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="Generate VQA conversations from label data.")
-    parser.add_argument("--image_root", type=str, default="/home/nguyendc/docai_dataset/factures/distill_data/docai_mgp_facture_v2_0", help="Root directory containing images.")
+    parser.add_argument("--image_root", type=str, default="/home/nguyendc/docai_dataset/factures/distill_data/trial_2/psycho_distill_300", help="Root directory containing images.")
-    parser.add_argument("--labels", type=str, default="/home/nguyendc/docai_dataset/factures/distill_data/docai_mgp_facture_v2_0/label_data.json", help="Path to the label data JSON file.")
+    parser.add_argument("--labels", type=str, default="/home/nguyendc/docai_dataset/factures/distill_data/trial_2/docai_mgp_facture_v2_0_400/label_data.json", help="Path to the label data JSON file.")
-    parser.add_argument("--system_prompt", type=str, default="/home/nguyendc/phong-dev/distillation/easydistill/mmkd/dev-vqa/qa_bank/unstructured_prompt.txt", help="Path to the system prompt text file.")
+    parser.add_argument("--system_prompt", type=str, default="./dev-vqa/qa_bank/unstructured_prompt.txt", help="Path to the system prompt text file.")
-    parser.add_argument("--questions", type=str, default="/home/nguyendc/phong-dev/distill/prompt/question_bank.json", help="Path to the question bank JSON file.")
+    parser.add_argument("--questions", type=str, default="./dev-vqa/qa_bank/question_bank.json", help="Path to the question bank JSON file.")
-    parser.add_argument("--answers", type=str, default="/home/nguyendc/phong-dev/distill/prompt/answer_bank.json", help="Path to the answer bank JSON file.")
+    parser.add_argument("--answers", type=str, default="./dev-vqa/qa_bank/answer_bank.json", help="Path to the answer bank JSON file.")
-    parser.add_argument("--output", type=str, default="/home/nguyendc/phong-dev/distill/vqa_label.json", help="Path to save the output VQA conversations JSON file.")
+    parser.add_argument("--output", type=str, default="./data/psycho_distill_300_vq_1_turn.json", help="Path to save the output VQA conversations JSON file.")
    parser.add_argument("--ratio", type=float, default=0.4, help="Ratio of fields to sample for questions (default: 0.4).")
    args = parser.parse_args()

-    # Define file paths
-    # IMAGE_ROOT = "/home/nguyendc/docai_dataset/factures/distill_data/lentille_distill_part_1_15"
-    # LABELS_FILE = os.path.join(IMAGE_ROOT, "label_data.json")
-    # UNSTRUCTURED_PROMPT_FILE = "/home/nguyendc/phong-dev/distillation/easydistill/mmkd/dev-vqa/qa_bank/unstructured_prompt.txt"
-    # QUESTION_BANK_FILE = "/home/nguyendc/phong-dev/distill/prompt/question_bank.json"
-    # ANSWER_BANK_FILE = "/home/nguyendc/phong-dev/distill/prompt/answer_bank.json"
-    # OUTPUT_FILE = "/home/nguyendc/phong-dev/distill/vqa_label_lentille.json"
-    # QUESTION_RATIO = 0.4
-
-    # Run the main generation function
+    # Single-turn, field-by-field conversations WITH labels
    generate_vqa_conversations(args.labels, args.image_root, args.system_prompt, args.questions, args.answers, args.output, args.ratio)
-    # generate_vqa_conversations(
-    #     LABELS_FILE,
-    #     IMAGE_ROOT,
-    #     UNSTRUCTURED_PROMPT_FILE,
-    #     QUESTION_BANK_FILE,
-    #     ANSWER_BANK_FILE,
-    #     OUTPUT_FILE,
-    #     QUESTION_RATIO,
-    # )
-    # generate_vq_question(
-    #     IMAGE_ROOT,
-    #     UNSTRUCTURED_PROMPT_FILE,
-    #     QUESTION_BANK_FILE,
-    #     OUTPUT_FILE,
-    #     QUESTION_RATIO,
-    # )
+
+    # Use this for multi-turn conversations WITH labels based on field groups
+    # generate_multiturn_conversations(args.labels, args.image_root, args.system_prompt, args.questions, args.answers, args.output)
+
+    # Use this for generating question-only prompts for unlabeled images
+    # generate_vq_question(args.image_root, args.system_prompt, args.questions, args.output, args.ratio)
+
+    # Use this for multi-turn question-only prompts for unlabeled images
+    # generate_multiturn_vq_question(args.image_root, args.system_prompt, args.questions, args.output)
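The multi-turn generators above index the question bank as question_bank[field][language]; they also assume that random and re are imported at the top of this script (the import section is outside this hunk). A minimal question_bank.json consistent with that access pattern might look like the sketch below; the question wording is illustrative only:

{
    "doctor_name": {
        "english": ["Who issued this invoice?", "What is the practitioner's name?"],
        "french": ["Qui a émis cette facture ?", "Quel est le nom du praticien ?"]
    },
    "profession": {
        "english": ["What is the practitioner's profession?"],
        "french": ["Quelle est la profession du praticien ?"]
    }
}

answer_bank.json is consumed through get_conversational_answer, which is defined elsewhere in the script and not shown in this diff.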
@@ -1,5 +1,6 @@
-You are an advanced AI agent created by Rizlum AI. Your task is to parse invoices and return only the requested information.
+You are an advanced AI agent created by Rizlum AI. Your primary function is to accurately answer questions based on the content of the document image provided.

-### **General Instructions**
+Instructions
-1. **Extract Only the Specified Fields**: Do not include extra information.
+- Answer Concisely: Directly and accurately answer the user's question.
-2. **Do Not Guess or hallucinate if information is missing or represented by placeholders (e.g., dots, dashes).**
+- Image Grounding: Your answer must be based only on the information visible in the image. Do not infer, guess, or use outside knowledge.
+- Handle Missing Information: If the information requested in the question is not present in the document, state that clearly. For example, say 'The information is not found on the document' or a similar phrase.
520697   easydistill/mmkd/dev-vqa/vqa_label.json (new file; diff suppressed because it is too large)
476441   easydistill/mmkd/dev-vqa/vqa_multi_turn_label.json (new file; diff suppressed because it is too large)
76592    easydistill/mmkd/dev-vqa/vqa_multi_turn_nolabel.json (new file; diff suppressed because it is too large)
515909   easydistill/mmkd/dev-vqa/vqa_nolabel.json (new file; diff suppressed because it is too large)
38   easydistill/mmkd/exporting.py (new file)
@@ -0,0 +1,38 @@
import torch
from peft import PeftModel
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

# --- 1. Define your model paths ---
base_model_path = "Qwen/Qwen2.5-VL-3B-Instruct"  # The original student model
adapter_path = "/home/azureuser/finetuned_models/qwen2.5_vl/lora/Qwen2.5-VL-3B_distill_all_nolabel"  # The folder where your LoRA adapter was saved
merged_model_path = "/home/azureuser/finetuned_models/qwen2.5_vl/Qwen2.5-VL-3B_distill_merged_all_nolabel"  # Where to save the new, merged model

print("Loading base model...")
# --- 2. Load the base model ---
# Loading on the CPU
base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    base_model_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cpu",
)

print("Loading LoRA adapter...")
# --- 3. Load the LoRA adapter onto the base model ---
model = PeftModel.from_pretrained(base_model, adapter_path)

print("Merging adapter into the base model...")
# --- 4. Merge the weights ---
# Combines the LoRA weights into the base model's layers.
model = model.merge_and_unload()

print(f"Saving merged model to {merged_model_path}...")
# --- 5. Save the new, standalone model ---
# The saved model is a standard Hugging Face model.
model.save_pretrained(merged_model_path)

# --- 6. Save the processor for easy use later ---
processor = AutoProcessor.from_pretrained(base_model_path, trust_remote_code=True)
processor.save_pretrained(merged_model_path)

print("Merge complete!")
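Once merged, the exported directory behaves like a standard Qwen2.5-VL checkpoint. A minimal loading sketch, reusing the merged_model_path defined above:

import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

merged_model_path = "/home/azureuser/finetuned_models/qwen2.5_vl/Qwen2.5-VL-3B_distill_merged_all_nolabel"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    merged_model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(merged_model_path)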
342   easydistill/mmkd/infer_2_custom.py (new file)
@@ -0,0 +1,342 @@
# Copyright 2024 Alibaba Group Holding Limited. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import json, jsonlines
import math
import argparse
import logging
from tqdm import tqdm
from openai import OpenAI
import torch
from transformers import AutoProcessor, AutoTokenizer
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


def read_json_field(filename):
    try:
        with open(filename, "r") as file:
            data = json.load(file)
            return data
    except FileNotFoundError:
        logging.error("The file was not found.")
    except json.JSONDecodeError:
        logging.error("There was an error decoding the JSON file.")
    except Exception as e:
        logging.error(f"An error occurred: {e}")


def write_data_to_json_file(data, file_path):
    try:
        with open(file_path, "w") as file:
            json.dump(data, file, ensure_ascii=False, indent=4)
        logging.info(f"Data successfully written to {file_path}")
    except Exception as e:
        logging.error(f"An error occurred: {e}")


def load_tokenizer_and_vllm(config, eos_token=None):

    model_path = config["models"]["teacher"]
    logging.info(f"Loading processor & vLLM model from {model_path}")

    # 1. Use AutoProcessor, which integrates the tokenizer, image_processor, and video_processor
    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

    # 2. eos / pad token handling (kept consistent with the official example; pad_token is not changed explicitly)
    if eos_token:
        eos_token_id = processor.tokenizer.convert_tokens_to_ids(eos_token)
        logging.info(f"eos_token {eos_token} from user input")
    elif (
        hasattr(processor.tokenizer, "eos_token_id")
        and processor.tokenizer.eos_token_id is not None
    ):
        eos_token_id = processor.tokenizer.eos_token_id
        eos_token = processor.tokenizer.convert_ids_to_tokens(eos_token_id)
        logging.info(f"Initial eos_token_id {eos_token_id} from tokenizer")
    else:
        raise ValueError("No available eos_token or eos_token_id.")

    # 3. Set the tokenizer's eos-related fields (pad_token stays None and is handled automatically by vLLM)
    try:
        processor.tokenizer.eos_token = eos_token
        processor.tokenizer.eos_token_id = eos_token_id
    except Exception as e:
        logging.warning(f"[WARNING] Cannot set eos_token: {e}")

    logging.info(
        f"processor.tokenizer eos_token: {processor.tokenizer.eos_token}, "
        f"eos_token_id: {processor.tokenizer.eos_token_id}"
    )

    num_gpus = torch.cuda.device_count()
    llm = LLM(
        model=model_path,
        tensor_parallel_size=num_gpus,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 10, "video": 10},  # adjust as needed
        # remaining hyperparameters are taken from the original config
        gpu_memory_utilization=config["inference"].get("gpu_memory_utilization", 0.99),
        max_model_len=config["inference"].get("max_model_len", 4096),
        enforce_eager=config["inference"].get("enforce_eager", False),
    )

    logging.info("Qwen2.5-VL vLLM model loaded successfully")
    # return processor, llm

    return processor, llm


def generate_teacher_response_batch(processor, llm, data_list, config, batch_size=1):
    # NOTE: This turn-by-turn generation is complex and works best with a batch size of 1.

    final_conversations = []

    # This version does not need logits, so the sampling params are simpler.
    sampling_params = SamplingParams(
        n=1,
        temperature=config["inference"]["temperature"],
        seed=config["inference"]["seed"],
        max_tokens=config["inference"]["max_new_tokens"],
    )

    for sample in tqdm(data_list, desc="Generating turn-by-turn conversations"):
        try:
            current_conversation = []

            # --- This is the same multi-turn logic as the logits function ---
            for i, message in enumerate(sample):
                current_conversation.append(message)

                # If the current message is from the user, generate a response
                if message.get("role") == "user":
                    # The prompt is the entire conversation up to this point
                    prompt_text = processor.apply_chat_template(
                        current_conversation,
                        tokenize=False,
                        add_generation_prompt=True,
                    )

                    image_inputs, _ = process_vision_info(current_conversation)
                    mm_data = {"image": image_inputs} if image_inputs else {}

                    # Generate the next assistant response
                    outputs = llm.generate(
                        [{"prompt": prompt_text, "multi_modal_data": mm_data}],
                        sampling_params=sampling_params,
                    )

                    generated_text = outputs[0].outputs[0].text

                    # Add the newly generated assistant message to the conversation
                    assistant_message = {
                        "role": "assistant",
                        "content": [{"type": "text", "text": generated_text}],
                    }
                    current_conversation.append(assistant_message)

            # After processing all turns, save the final conversation
            final_conversations.append(current_conversation)

        except Exception as e:
            logging.error(f"An error occurred processing a sample: {e}")
            continue

    # Save the final, fully completed conversational data
    # write_data_to_json_file(final_conversations, config["dataset"]["labeled_path"])
    return final_conversations


def generate_teacher_logits_batch(processor, llm, data_list, config, batch_size=1):
    # NOTE: This turn-by-turn generation is complex and works best with a batch size of 1.

    final_conversations = []
    final_logits = []

    sampling_params = SamplingParams(
        n=1,
        temperature=config["inference"]["temperature"],
        seed=config["inference"]["seed"],
        max_tokens=config["inference"]["max_new_tokens"],
        # logprobs=config["inference"]["top_logits_num"],
        output_logits=True,
    )

    for sample in data_list:
        # tqdm(data_list, desc="Generating turn-by-turn conversations"):
        try:
            current_conversation = []
            current_logits_sequence = []

            # --- MODIFICATION: Loop through each message to build the conversation turn by turn ---
            for i, message in enumerate(sample):
                current_conversation.append(message)

                # If the current message is from the user, generate a response
                if message.get("role") == "user":
                    # The prompt is the entire conversation up to this point
                    prompt_text = processor.apply_chat_template(
                        current_conversation,
                        tokenize=False,
                        add_generation_prompt=True,
                    )

                    image_inputs, _ = process_vision_info(current_conversation)
                    mm_data = {"image": image_inputs} if image_inputs else {}

                    # Generate the next assistant response
                    outputs = llm.generate(
                        [{"prompt": prompt_text, "multi_modal_data": mm_data}],
                        sampling_params=sampling_params,
                    )

                    generated_text = outputs[0].outputs[0].text
                    logits_for_turn = outputs[0].outputs[0].logits  # logits instead of logprobs

                    # Add the newly generated assistant message to the conversation
                    assistant_message = {
                        "role": "assistant",
                        "content": [{"type": "text", "text": generated_text}],
                    }
                    current_conversation.append(assistant_message)

                    # Add the logits for this turn to our sequence
                    if logits_for_turn is not None:
                        current_logits_sequence.extend(logits_for_turn.cpu().tolist())

            # After processing all turns, save the final results for this sample
            final_conversations.append(current_conversation)
            final_logits.append(current_logits_sequence)

        except Exception as e:
            logging.error(f"An error occurred processing a sample: {e}")
            continue

    processed_logits = final_logits

    with jsonlines.open(config["dataset"]["logits_path"], mode="w") as writer:
        writer.write_all(processed_logits)

    # Save the final, fully completed conversational data
    # write_data_to_json_file(final_conversations, config["dataset"]["labeled_path"])
    return final_conversations, processed_logits


def generate_teacher_response_api(data_list, config):
    client = OpenAI(
        api_key=config["inference"]["api_key"], base_url=config["inference"]["base_url"]
    )
    model = client.models.list().data[0].id
    logging.info(f"Using remote model: {model}")

    final_conversations = []

    for sample in data_list:
        # tqdm(
        #     data_list, desc="Calling remote API for multi-turn conversations"
        # ):
        try:
            current_conversation = []
            # Loop through each message to build the conversation turn by turn
            for message in sample:
                current_conversation.append(message)

                # If the current message is from the user, generate a response
                if message.get("role") == "user":
                    # The API expects the full history for context
                    completion = client.chat.completions.create(
                        messages=current_conversation,
                        model=model,
                        max_tokens=config["inference"]["max_new_tokens"],
                    )
                    generated_text = completion.choices[0].message.content

                    # Add the newly generated assistant message
                    assistant_message = {
                        "role": "assistant",
                        "content": generated_text,  # API returns a simple string
                    }
                    current_conversation.append(assistant_message)

            final_conversations.append(current_conversation)
        except Exception as e:
            logging.error(f"An error occurred processing a sample with the API: {e}")
            continue

    write_data_to_json_file(final_conversations, config["dataset"]["labeled_path"])


def infer_with_teacher_model(config):
    logging.info("Generating distillation data from the teacher model!")
    data_list = read_json_field(config["dataset"]["instruction_path"])

    try:
        job_type = config["job_type"]

        if job_type == "mmkd_black_box_api":
            # API calls don't need a local model.
            generate_teacher_response_api(data_list, config)

        elif job_type in ["mmkd_black_box_local", "mmkd_white_box"]:
            # 1. Load the model and processor a single time at the start.
            processor, llm = load_tokenizer_and_vllm(config)

            if job_type == "mmkd_black_box_local":
                # 2. The function now returns the results.
                final_conversations = generate_teacher_response_batch(
                    processor, llm, data_list, config
                )
                # 3. Save the final results.
                write_data_to_json_file(final_conversations, config["dataset"]["labeled_path"])

            elif job_type == "mmkd_white_box":
                # 2. The function now returns both conversations and logits.
                final_conversations, final_logits = generate_teacher_logits_batch(
                    processor, llm, data_list, config
                )
                # 3. Save both final results files.
                logging.info("Writing all accumulated data to final output files...")
                with jsonlines.open(config["dataset"]["logits_path"], mode='w') as writer:
                    writer.write_all(final_logits)
                write_data_to_json_file(final_conversations, config["dataset"]["labeled_path"])

        else:
            logging.error(f"Invalid job type: {job_type}")
            raise ValueError(f"Invalid job type: {job_type}")

    except ValueError as e:
        logging.error(f"Training job terminated: {e}")
        return


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config", type=str, required=True, help="path to the json config file"
    )
    args = parser.parse_args()
    config = json.load(open(args.config))
    infer_with_teacher_model(config)


if __name__ == "__main__":
    main()
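The script is driven entirely by a JSON config. A hypothetical minimal config covering the keys read above (job_type, models.teacher, the dataset paths, and the inference settings) is sketched below; every value is illustrative, and api_key/base_url under inference are only needed for the mmkd_black_box_api job type:

{
    "job_type": "mmkd_white_box",
    "models": {
        "teacher": "Qwen/Qwen2.5-VL-7B-Instruct"
    },
    "dataset": {
        "instruction_path": "data/vqa_label.json",
        "labeled_path": "data/teacher_labeled.json",
        "logits_path": "data/teacher_logits.jsonl"
    },
    "inference": {
        "temperature": 0.1,
        "seed": 42,
        "max_new_tokens": 1024,
        "top_logits_num": 20,
        "gpu_memory_utilization": 0.95,
        "max_model_len": 4096,
        "enforce_eager": false
    }
}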
156   easydistill/mmkd/infer_chunk.py (new file)
@@ -0,0 +1,156 @@
import json, jsonlines
import math
import argparse
import logging
from tqdm import tqdm
import torch
from transformers import AutoProcessor
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
import os
import multiprocessing as mp

os.environ["TOKENIZERS_PARALLELISM"] = "false"

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


def read_json_field(filename):
    try:
        with open(filename, "r") as file:
            return json.load(file)
    except Exception as e:
        logging.error(f"An error occurred reading {filename}: {e}")
        return None


def write_data_to_json_file_append(data, file_path):
    """Appends a list of JSON objects to a file, one object per line."""
    try:
        with open(file_path, "a") as file:
            for item in data:
                file.write(json.dumps(item, ensure_ascii=False) + '\n')
        logging.info(f"Data successfully appended to {file_path}")
    except Exception as e:
        logging.error(f"An error occurred writing to {file_path}: {e}")


def load_tokenizer_and_vllm(config):
    model_path = config["models"]["teacher"]
    logging.info(f"Loading processor & vLLM model from {model_path}")
    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
    num_gpus = torch.cuda.device_count()
    llm = LLM(
        model=model_path,
        tensor_parallel_size=num_gpus,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 10},
        gpu_memory_utilization=config["inference"].get("gpu_memory_utilization", 0.95),
        max_model_len=config["inference"].get("max_model_len", 4096),
    )
    logging.info("Qwen2.5-VL vLLM model loaded successfully")
    return processor, llm


def generate_teacher_logits(processor, llm, data_list, config):
    """
    Processes a chunk of data, generating both conversations and logits.
    This function now returns the results instead of writing them.
    """
    final_conversations = []
    final_logits = []
    sampling_params = SamplingParams(
        n=1,
        temperature=config["inference"]["temperature"],
        seed=config["inference"]["seed"],
        max_tokens=config["inference"]["max_new_tokens"],
        logprobs=config["inference"]["top_logits_num"],
    )

    for sample in tqdm(data_list, desc="Processing chunk"):
        try:
            current_conversation = []
            current_logits_sequence = []
            for message in sample:
                current_conversation.append(message)
                if message.get("role") == "user":
                    prompt_text = processor.apply_chat_template(
                        current_conversation,
                        tokenize=False,
                        add_generation_prompt=True,
                    )
                    image_inputs, _ = process_vision_info(current_conversation)
                    mm_data = {"image": image_inputs} if image_inputs else {}
                    outputs = llm.generate(
                        [{"prompt": prompt_text, "multi_modal_data": mm_data}],
                        sampling_params=sampling_params,
                    )
                    generated_text = outputs[0].outputs[0].text
                    logprobs_for_turn = outputs[0].outputs[0].logprobs
                    assistant_message = {
                        "role": "assistant",
                        "content": [{"type": "text", "text": generated_text}],
                    }
                    current_conversation.append(assistant_message)
                    if logprobs_for_turn:
                        current_logits_sequence.extend(logprobs_for_turn)
            final_conversations.append(current_conversation)
            final_logits.append(current_logits_sequence)
        except Exception as e:
            logging.error(f"An error occurred processing a sample: {e}")
            continue

    processed_logits = []
    for logit_sequence in final_logits:
        sequence = []
        if logit_sequence:
            for step in logit_sequence:
                probs = {
                    token_id: math.exp(logprob.logprob)
                    for token_id, logprob in step.items()
                }
                sequence.append(probs)
        processed_logits.append(sequence)

    return final_conversations, processed_logits


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, required=True)
    # arguments to define the data chunk ---
    parser.add_argument("--start_index", type=int, required=True)
    parser.add_argument("--end_index", type=int, required=True)
    args = parser.parse_args()
    config = json.load(open(args.config))

    logging.info(f"Processing chunk from index {args.start_index} to {args.end_index}")
    full_data_list = read_json_field(config["dataset"]["instruction_path"])

    # Slice the data to process only the assigned chunk
    chunk_data_list = full_data_list[args.start_index : args.end_index]

    if not chunk_data_list:
        logging.info("This chunk is empty. Exiting.")
        return

    processor, llm = load_tokenizer_and_vllm(config)

    # Generate the data for the chunk
    final_conversations, final_logits = generate_teacher_logits(
        processor, llm, chunk_data_list, config
    )

    # Append the results to the output files
    write_data_to_json_file_append(final_conversations, config["dataset"]["labeled_path"])
    with jsonlines.open(config["dataset"]["logits_path"], mode='a') as writer:
        writer.write_all(final_logits)

    logging.info(f"Finished processing chunk {args.start_index}-{args.end_index}.")


if __name__ == "__main__":
    try:
        mp.set_start_method("spawn", force=True)
        logging.info("Multiprocessing start method set to 'spawn'.")
    except RuntimeError:
        # This might happen if it's already set, which is fine.
        pass
    main()
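Each run of this worker covers one slice of the instruction file and appends its results to the output files, so a single chunk can also be launched by hand (the config file name is a placeholder):

python3 easydistill/mmkd/infer_chunk.py --config mmkd_white_box.json --start_index 0 --end_index 50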
@@ -1,5 +1,48 @@
{
    "templates": [
+        {
+            "prompts": {
+                "en": [
+                    "Extract all structured information from the document.",
+                    "Provide a complete JSON output of all relevant fields from the invoice.",
+                    "Parse the entire document and return all available details.",
+                    "Get all invoice details, including provider, patient, and financial information."
+                ],
+                "fr": [
+                    "Extraire toutes les informations structurées du document.",
+                    "Fournir une sortie JSON complète de tous les champs pertinents de la facture.",
+                    "Analyser l'intégralité du document et retourner tous les détails disponibles.",
+                    "Obtenir tous les détails de la facture, y compris les informations sur le prestataire, le patient et les finances."
+                ]
+            },
+            "group_name": "full_invoice_extraction",
+            "target_keys": [
+                "is_bill",
+                "profession",
+                "adeli_number",
+                "rpps_number",
+                "finess_number",
+                "doctor_name",
+                "total_billed",
+                "bill_paid",
+                "amount_paid",
+                "mandatory_coverage",
+                "complementary_coverage",
+                "client_part",
+                "remaining_payment",
+                "insured_name",
+                "insured_dob",
+                "beneficiary_name",
+                "beneficiary_dob",
+                "care_start_date",
+                "care_end_date",
+                "invoice_date",
+                "security_number",
+                "invoice_issuer",
+                "currency",
+                "items"
+            ]
+        },
        {
            "prompts": {
                "en": [
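For reference, create_vqa_pairs.py pairs this template with the JSON schema file: build_user_prompt picks one prompt string from the chosen language and appends a sub-schema restricted to the template's target_keys. With a hypothetical schema that only defined doctor_name and invoice_date, the assembled user prompt would look roughly like:

Extract all structured information from the document.
Strictly return a valid JSON following this schema:

**Json schema**
{
    "type": "object",
    "properties": {
        "doctor_name": {"type": "string"},
        "invoice_date": {"type": "string"}
    }
}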
75   easydistill/mmkd/runner.py (new file)
@@ -0,0 +1,75 @@
import json
import os
import subprocess
import argparse
from tqdm import tqdm


def main():
    parser = argparse.ArgumentParser(description="Controller script for running inference in chunks.")
    parser.add_argument("--config", type=str, required=True, help="Path to the main JSON config file.")
    parser.add_argument("--infer_script", type=str, required=True, help="Path to the infer.py worker script.")
    parser.add_argument("--chunk_size", type=int, default=50, help="Number of documents to process in each subprocess.")
    args = parser.parse_args()

    # 1. Load the config to find the instruction path
    config = json.load(open(args.config))
    instruction_path = config["dataset"]["instruction_path"]
    labeled_path = config["dataset"]["labeled_path"]
    logits_path = config["dataset"]["logits_path"]

    # 2. Clear previous output files before starting
    if os.path.exists(labeled_path):
        os.remove(labeled_path)
    if os.path.exists(logits_path):
        os.remove(logits_path)
    print(f"Cleared previous output files: {labeled_path} and {logits_path}")

    # 3. Load the full dataset to get the total count
    with open(instruction_path) as f:
        total_data = json.load(f)
    total_size = len(total_data)

    print(f"Total documents to process: {total_size}")

    # 4. Loop through the data in chunks
    for i in tqdm(range(0, total_size, args.chunk_size), desc="Processing chunks"):
        start_index = i
        end_index = min(i + args.chunk_size, total_size)

        print(f"\n----- Processing chunk: {start_index} to {end_index} -----")

        # 5. Construct the command to call your inference script
        command = [
            "python3",
            args.infer_script,
            "--config", args.config,
            "--start_index", str(start_index),
            "--end_index", str(end_index),
        ]

        # 6. Run the command as a subprocess and wait for it to complete
        try:
            # Using capture_output=True and text=True to see the output
            result = subprocess.run(
                command,
                check=True,
                capture_output=True,
                text=True
            )
            print(result.stdout)
            if result.stderr:
                print("--- Errors from subprocess ---")
                print(result.stderr)

        except subprocess.CalledProcessError as e:
            print(f"!!! FATAL ERROR processing chunk {start_index}-{end_index}. Aborting. !!!")
            print("--- Subprocess stdout ---")
            print(e.stdout)
            print("--- Subprocess stderr ---")
            print(e.stderr)
            break

    print("\n----- All chunks processed successfully! -----")


if __name__ == "__main__":
    main()
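The controller pairs with infer_chunk.py above: it clears the output files once, then launches one subprocess per chunk so each slice of the dataset is processed with a fresh worker. A hypothetical invocation (paths are placeholders):

python3 easydistill/mmkd/runner.py \
    --config mmkd_white_box.json \
    --infer_script easydistill/mmkd/infer_chunk.py \
    --chunk_size 50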
@@ -30,10 +30,11 @@ from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
+    AutoConfig
)
from qwen_vl_utils import process_vision_info
from trl import SFTTrainer, SFTConfig
+from peft import LoraConfig

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
@@ -73,10 +74,6 @@ class DistillSFTTrainer(SFTTrainer):
        self.kd_ratio = kd_ratio
        self.max_seq_length = max_seq_length
        self.distillation_type = distillation_type
-        self.teacher_logits = []
-        with jsonlines.open(self.logits_dir) as reader:
-            for obj in reader:
-                self.teacher_logits.append(obj)

    def _load_teacher_logits(
        self,
@@ -88,7 +85,16 @@ class DistillSFTTrainer(SFTTrainer):
|
|||||||
):
|
):
|
||||||
start_idx = dp_rank * batch_size + batch_size * it
|
start_idx = dp_rank * batch_size + batch_size * it
|
||||||
end_idx = dp_rank * batch_size + batch_size * (it + 1)
|
end_idx = dp_rank * batch_size + batch_size * (it + 1)
|
||||||
loaded_data = self.teacher_logits[start_idx:end_idx]
|
|
||||||
|
loaded_data = []
|
||||||
|
# Open file and read only the specific lines needed for the current batch
|
||||||
|
with jsonlines.open(self.logits_dir) as reader:
|
||||||
|
for i, obj in enumerate(reader):
|
||||||
|
if i >= start_idx and i < end_idx:
|
||||||
|
loaded_data.append(obj)
|
||||||
|
elif i >= end_idx:
|
||||||
|
break
|
||||||
|
|
||||||
arr = np.zeros((batch_size, self.max_seq_length, self.teacher_vocab_size))
|
arr = np.zeros((batch_size, self.max_seq_length, self.teacher_vocab_size))
|
||||||
for i in range(len(loaded_data)):
|
for i in range(len(loaded_data)):
|
||||||
for j in range(len(loaded_data[i])):
|
for j in range(len(loaded_data[i])):
|
||||||
@@ -117,6 +123,8 @@ class DistillSFTTrainer(SFTTrainer):
|
|||||||
else torch.ones_like(student_logits[:, :, 0])
|
else torch.ones_like(student_logits[:, :, 0])
|
||||||
)
|
)
|
||||||
|
|
||||||
|
mask = mask[:, : self.max_seq_length]
|
||||||
|
|
||||||
if self.distillation_type == "forward_kld":
|
if self.distillation_type == "forward_kld":
|
||||||
# Forward KLD: student learns from teacher (original implementation)
|
# Forward KLD: student learns from teacher (original implementation)
|
||||||
loss = F.kl_div(
|
loss = F.kl_div(
|
||||||
@@ -197,9 +205,23 @@ def train(config):
|
|||||||
raw_data = json.load(f)
|
raw_data = json.load(f)
|
||||||
dataset = MMDataset(raw_data)
|
dataset = MMDataset(raw_data)
|
||||||
student_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
student_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
||||||
config["models"]["student"], trust_remote_code=True
|
config["models"]["student"],
|
||||||
|
torch_dtype=torch.bfloat16,
|
||||||
|
attn_implementation="flash_attention_2",
|
||||||
|
trust_remote_code=True,
|
||||||
|
device_map="auto",
|
||||||
)
|
)
|
||||||
processor = Qwen2_5_VLProcessor.from_pretrained(config["models"]["student"])
|
processor = Qwen2_5_VLProcessor.from_pretrained(config["models"]["student"])
|
||||||
|
|
||||||
|
# Creating LoRA configuration
|
||||||
|
lora_config = LoraConfig(
|
||||||
|
r=16, # Rank of the LoRA layers
|
||||||
|
lora_alpha=32, # Scaling factor for the LoRA layers
|
||||||
|
lora_dropout=0.1, # Dropout rate for the LoRA layers
|
||||||
|
bias="none", # No bias in LoRA layers
|
||||||
|
task_type="CAUSAL_LM", # Task type for the LoRA layers
|
||||||
|
target_modules=["q_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "o_proj"], # Target modules for LoRA
|
||||||
|
)
|
||||||
|
|
||||||
training_arguments = SFTConfig(**config["training"])
|
training_arguments = SFTConfig(**config["training"])
|
||||||
training_arguments.gradient_checkpointing_kwargs = dict(use_reentrant=False)
|
training_arguments.gradient_checkpointing_kwargs = dict(use_reentrant=False)
|
||||||
@@ -241,14 +263,18 @@ def train(config):
|
|||||||
trainer = SFTTrainer(
|
trainer = SFTTrainer(
|
||||||
model=student_model,
|
model=student_model,
|
||||||
data_collator=collate_fn,
|
data_collator=collate_fn,
|
||||||
processing_class=processor.tokenizer,
|
tokenizer=processor.tokenizer,
|
||||||
args=training_arguments,
|
args=training_arguments,
|
||||||
train_dataset=dataset,
|
train_dataset=dataset,
|
||||||
|
peft_config=lora_config,
|
||||||
)
|
)
|
||||||
elif "mmkd_white_box" in job_type:
|
elif "mmkd_white_box" in job_type:
|
||||||
teacher_vocab_size = json.load(
|
teacher_config = AutoConfig.from_pretrained(
|
||||||
open(os.path.join(config["models"]["teacher"], "config.json"))
|
config["models"]["teacher"],
|
||||||
)["vocab_size"]
|
trust_remote_code=True
|
||||||
|
)
|
||||||
|
teacher_vocab_size = teacher_config.vocab_size
|
||||||
|
|
||||||
trainer = DistillSFTTrainer(
|
trainer = DistillSFTTrainer(
|
||||||
logits_dir=config["dataset"]["logits_path"],
|
logits_dir=config["dataset"]["logits_path"],
|
||||||
data_collator=collate_fn,
|
data_collator=collate_fn,
|
||||||
@@ -259,7 +285,8 @@ def train(config):
|
|||||||
"distillation_type", "forward_kld"
|
"distillation_type", "forward_kld"
|
||||||
),
|
),
|
||||||
model=student_model,
|
model=student_model,
|
||||||
processing_class=processor.tokenizer,
|
peft_config=lora_config,
|
||||||
|
tokenizer=processor.tokenizer,
|
||||||
args=training_arguments,
|
args=training_arguments,
|
||||||
train_dataset=dataset,
|
train_dataset=dataset,
|
||||||
)
|
)
|
||||||
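To make the streaming change above concrete, here is a small sketch of the teacher-logits JSONL layout it assumes: one line per training sample, each line a list over sequence positions, each position a sparse {token_id: probability} dict, which is exactly what the arr[i, j, keys] = values loop densifies. The file name and all numbers below are illustrative, not taken from the repository.

# Illustrative only: writes two fake samples in the sparse per-position format
# that _load_teacher_logits() streams with jsonlines, then rebuilds one batch.
import jsonlines
import numpy as np

fake_logits = [
    # sample 0: two positions, each mapping top token ids to probabilities
    [{"13": 0.72, "198": 0.20}, {"11": 0.55, "25": 0.30}],
    # sample 1: a single position
    [{"151645": 0.9}],
]
with jsonlines.open("teacher_logits_example.jsonl", "w") as writer:
    writer.write_all(fake_logits)

# Densify the same way the trainer does (vocab size and max_seq_length are made up here)
vocab_size, max_seq_length, batch_size = 152064, 4, 2
arr = np.zeros((batch_size, max_seq_length, vocab_size))
with jsonlines.open("teacher_logits_example.jsonl") as reader:
    for i, obj in enumerate(reader):
        for j, sparse in enumerate(obj):
            keys = np.array(list(sparse.keys()), dtype=int)
            values = np.array(list(sparse.values()))
            arr[i, j, keys] = values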
322
easydistill/mmkd/train_lora_2_custom.py
Normal file
@@ -0,0 +1,322 @@
# Copyright 2024 Alibaba Group Holding Limited. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import json
import torch
import numpy as np
import jsonlines
import torch.nn.functional as F
import os
import argparse
import logging
from datasets import load_dataset, Dataset
from typing import Optional, Dict, Union, List
from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor
from transformers import (
    PreTrainedModel,
    PreTrainedTokenizerBase,
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    AutoConfig
)
from qwen_vl_utils import process_vision_info
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


from torch.utils.data import Dataset
from PIL import Image
import os


class MMDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[int(idx)]


class DistillSFTTrainer(SFTTrainer):

    def __init__(
        self,
        logits_dir: str = None,
        teacher_vocab_size=None,
        kd_ratio: float = 0.5,
        max_seq_length: int = 1024,
        distillation_type: str = "forward_kld",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.logits_dir = logits_dir
        self.teacher_vocab_size = teacher_vocab_size
        self.kd_ratio = kd_ratio
        self.max_seq_length = max_seq_length
        self.distillation_type = distillation_type

    def _load_teacher_logits(
        self,
        batch_size: int,
        it: int,
        dp_rank: int,
        device: torch.device,
        no_model_batch: Dict,
    ):
        start_idx = dp_rank * batch_size + batch_size * it
        end_idx = dp_rank * batch_size + batch_size * (it + 1)

        loaded_data = []
        # Open file and read only the specific lines needed for the current batch
        with jsonlines.open(self.logits_dir) as reader:
            for i, obj in enumerate(reader):
                if i >= start_idx and i < end_idx:
                    loaded_data.append(obj)
                elif i >= end_idx:
                    break

        arr = np.zeros((batch_size, self.max_seq_length, self.teacher_vocab_size))
        for i in range(len(loaded_data)):
            for j in range(len(loaded_data[i])):
                keys = np.array(list(loaded_data[i][j].keys()), dtype=int)
                values = np.array(list(loaded_data[i][j].values()))
                arr[i, j, keys] = values

        logits_tensor = torch.tensor(arr, dtype=torch.bfloat16, device=device)
        return self._shift_tensor_right(
            logits_tensor, no_model_batch["label"], pad_value=0
        )

    def _compute_white_box_distillation_loss(
        self,
        student_logits: torch.Tensor,
        teacher_logits: torch.Tensor,
        labels: Optional[torch.Tensor],
        temperature: float = 1.0,
    ):
        student_logits = student_logits[:, : self.max_seq_length, :]
        teacher_logits = teacher_logits[
            :, : student_logits.size(1), : student_logits.size(-1)
        ]
        mask = (
            (labels != -100).float()
            if labels is not None
            else torch.ones_like(student_logits[:, :, 0])
        )

        mask = mask[:, : self.max_seq_length]

        # Apply temperature scaling
        student_log_probs = F.log_softmax(student_logits / temperature, dim=-1)
        teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)

        if self.distillation_type == "forward_kld":
            # Forward KLD: student learns from teacher (original implementation)
            loss = F.kl_div(
                student_log_probs,
                teacher_probs,
                reduction="none",
                log_target=False,
            ).sum(dim=-1)  # / torch.sum(mask.view(-1), dim=0)
        elif self.distillation_type == "reverse_kld":
            # Reverse KLD: teacher provides certainty to student
            loss = F.kl_div(
                torch.log(teacher_probs.clamp(min=1e-10)),  # avoid log(0)
                F.softmax(student_logits / temperature, dim=-1),
                reduction="none",
                log_target=False,
            ).sum(dim=-1)  # / torch.sum(mask.view(-1), dim=0)
        else:
            raise ValueError(
                f"Unsupported distillation type: {self.distillation_type}. Use 'forward_kld' or 'reverse_kld'"
            )

        return (loss * mask).sum() / mask.sum() * (temperature ** 2)

    @staticmethod
    def _shift_tensor_right(
        inputs: torch.Tensor, labels: torch.Tensor, pad_value: float = 0.0
    ):
        batch_size, seqlen, vocab_size = inputs.shape
        device = inputs.device
        labels_ne = labels != -100
        shift_distances = torch.argmax(labels_ne.int(), dim=1)
        idx = (
            torch.arange(seqlen, device=device).unsqueeze(0).expand(batch_size, seqlen)
        )
        shifted_idx = idx - shift_distances.unsqueeze(1)
        mask = shifted_idx >= 0
        shifted_idx = shifted_idx.clamp(min=0)
        inputs_flat = inputs.view(batch_size, seqlen, vocab_size)
        shifted_idx = shifted_idx.unsqueeze(2).expand(-1, -1, vocab_size)
        gathered = torch.gather(inputs_flat, 1, shifted_idx)
        mask = mask.unsqueeze(2).expand(-1, -1, vocab_size)
        return torch.where(mask, gathered, torch.full_like(gathered, pad_value))

    def compute_loss(
        self,
        model: PreTrainedModel,
        inputs: Dict[str, torch.Tensor],
        return_outputs=False,
        num_items_in_batch=None,
    ):
        outputs = model(**inputs)
        lm_loss = outputs.loss
        if self.logits_dir:
            teacher_logits = self._load_teacher_logits(
                batch_size=inputs["input_ids"].size(0),
                it=self.state.global_step,
                dp_rank=(
                    torch.distributed.get_rank()
                    if torch.distributed.is_initialized()
                    else 0
                ),
                device=model.device,
                no_model_batch={"label": inputs.get("labels", None)},
            )
            distil_loss = self._compute_white_box_distillation_loss(
                student_logits=outputs.logits,
                teacher_logits=teacher_logits,
                labels=inputs.get("labels", None),
            )
            total_loss = (1 - self.kd_ratio) * lm_loss + self.kd_ratio * distil_loss
        else:
            total_loss = lm_loss
        return (total_loss, outputs) if return_outputs else total_loss


def train(config):
    with open(config["dataset"]["labeled_path"], "r") as f:
        raw_data = json.load(f)
    dataset = MMDataset(raw_data)
    student_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        config["models"]["student"],
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        trust_remote_code=True,
        device_map="auto",
    )
    processor = Qwen2_5_VLProcessor.from_pretrained(config["models"]["student"])

    # Creating LoRA configuration
    lora_config = LoraConfig(
        r=config["training"]["lora_rank"],  # Rank of the LoRA layers
        lora_alpha=config["training"]["lora_alpha"],  # Scaling factor for the LoRA layers
        lora_dropout=config["training"]["lora_dropout"],  # Dropout rate for the LoRA layers
        bias="none",  # No bias in LoRA layers
        task_type="CAUSAL_LM",  # Task type for the LoRA layers
        target_modules=["q_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "o_proj"],  # Target modules for LoRA
    )

    training_arguments = SFTConfig(**config["training"])
    training_arguments.gradient_checkpointing_kwargs = dict(use_reentrant=False)
    training_arguments.remove_unused_columns = False
    training_arguments.dataset_kwargs = {"skip_prepare_dataset": True}

    def collate_fn(examples):
        texts = []
        images = []
        for example in examples:
            chat = example
            text = processor.apply_chat_template(chat, tokenize=False)
            texts.append(text)

            image, _ = process_vision_info(example)
            images.append(image)

        batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
        labels = batch["input_ids"].clone()
        labels[labels == processor.tokenizer.pad_token_id] = -100

        if isinstance(processor, Qwen2_5_VLProcessor):
            image_tokens = [151652, 151653, 151655]
        else:
            image_tokens = [
                processor.tokenizer.convert_tokens_to_ids(processor.image_token)
            ]

        for image_token_id in image_tokens:
            labels[labels == image_token_id] = -100
        batch["labels"] = labels
        return batch

    try:
        job_type = config["job_type"]
        if "mmkd_black_box" in job_type:
            trainer = SFTTrainer(
                model=student_model,
                data_collator=collate_fn,
                # tokenizer=processor.tokenizer,
                args=training_arguments,
                train_dataset=dataset,
                peft_config=lora_config,
            )
        elif "mmkd_white_box" in job_type:
            teacher_config = AutoConfig.from_pretrained(
                config["models"]["teacher"],
                trust_remote_code=True
            )
            teacher_vocab_size = teacher_config.vocab_size

            trainer = DistillSFTTrainer(
                logits_dir=config["dataset"]["logits_path"],
                data_collator=collate_fn,
                teacher_vocab_size=teacher_vocab_size,
                kd_ratio=config["distillation"]["kd_ratio"],
                max_seq_length=config["distillation"]["max_seq_length"],
                distillation_type=config["distillation"].get(
                    "distillation_type", "forward_kld"
                ),
                model=student_model,
                peft_config=lora_config,
                # tokenizer=processor.tokenizer,
                args=training_arguments,
                train_dataset=dataset,
            )
        else:
            logging.error(f"Invalid job type: {job_type}")
            raise ValueError(f"Invalid job type: {job_type}")
    except ValueError as e:
        logging.error(f"Training job terminated: {e}")
        return

    trainer.train()
    trainer.save_model(config["training"]["output_dir"])
    processor.tokenizer.save_pretrained(config["training"]["output_dir"])


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config", type=str, required=True, help="path to the json config file"
    )
    args = parser.parse_args()
    config = json.load(open(args.config))
    train(config)


if __name__ == "__main__":
    main()
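A minimal sketch of the config file this new training script expects, inferred from the keys it actually reads (config["job_type"], config["models"], config["dataset"], config["training"], config["distillation"]). Every concrete value below is an illustrative assumption, and note that, as written, the whole "training" block is also forwarded to SFTConfig(**config["training"]).

# Illustrative config sketch for train_lora_2_custom.py; all values are assumptions.
import json

sketch = {
    "job_type": "mmkd_white_box",                    # or "mmkd_black_box"
    "models": {
        "student": "Qwen/Qwen2.5-VL-3B-Instruct",    # any local path or hub id (assumed)
        "teacher": "Qwen/Qwen2.5-VL-72B-Instruct",   # only its config.vocab_size is read
    },
    "dataset": {
        "labeled_path": "result/labeled.json",        # chat-format samples wrapped by MMDataset
        "logits_path": "result/logits.jsonl",         # sparse teacher logits, one sample per line
    },
    "training": {
        "output_dir": "result/student_lora",
        "per_device_train_batch_size": 1,
        "num_train_epochs": 1,
        "lora_rank": 16,       # read by LoraConfig; as written, also passed on to SFTConfig
        "lora_alpha": 32,
        "lora_dropout": 0.1,
    },
    "distillation": {
        "kd_ratio": 0.5,
        "max_seq_length": 1024,
        "distillation_type": "forward_kld",          # or "reverse_kld"
    },
}

with open("mmkd_white_box_lora.json", "w") as f:
    json.dump(sketch, f, indent=2)

Launching would then look like python3 easydistill/mmkd/train_lora_2_custom.py --config mmkd_white_box_lora.json, since --config is the only flag the script defines.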