[Init] Init easy distill for Knowledge distillation

2025-08-07 08:38:26 +00:00
parent 2f21aaae17
commit 0637599c3a
19 changed files with 170614 additions and 3 deletions
--- a/easydistill/mmkd/create_question_answering_pairs.py
+++ b/easydistill/mmkd/create_question_answering_pairs.py
@@ -0,0 +1,121 @@
+import json
+import re
+
+
+def load_prompt_templates(filepath):
+    """Read the prompt templates stored as JSON at ``filepath``.
+
+    Returns the parsed JSON content. When the file does not exist or does not
+    contain valid JSON, an error message is printed and ``None`` is returned.
+    """
+    templates = None
+    try:
+        with open(filepath, "r", encoding="utf-8") as handle:
+            templates = json.load(handle)
+    except FileNotFoundError:
+        print(f"Error: The file {filepath} was not found.")
+    except json.JSONDecodeError:
+        print(f"Error: The file {filepath} is not a valid JSON file.")
+    return templates
+
+
+def get_nested_value(data_dict, key_path):
+    """
+    Retrieves a value from a nested dictionary or list using a dotted string path.
+
+    Args:
+        data_dict (dict): The dictionary to read from.
+        key_path (str): A top-level key (e.g. "total_billed") or a dotted path
+            (e.g. "items.description", "doctor.name").
+
+    Returns:
+        - Path without a dot: ``data_dict.get(key_path)``.
+        - First segment maps to a list: a list with the value of the remaining
+          key for every element that is a dict containing that key (other
+          elements are skipped, so the result can be shorter than the list).
+        - First segment maps to a dict: the remainder of the path resolved
+          inside that dict.
+        - Otherwise (segment missing, or mapped to a scalar): ``None``.
+    """
+    # Handle simple, top-level keys
+    if "." not in key_path:
+        return data_dict.get(key_path)
+
+    # Handle nested keys like 'items.amount'
+    main_key, sub_key = key_path.split(".", 1)
+    child = data_dict.get(main_key)
+
+    if isinstance(child, list):
+        # Extract the sub_key from each object in the list
+        return [
+            item.get(sub_key)
+            for item in child
+            if isinstance(item, dict) and sub_key in item
+        ]
+
+    if isinstance(child, dict):
+        # Descend into nested dictionaries; the remainder may itself be dotted
+        return get_nested_value(child, sub_key)
+
+    return None
+
+
+def get_label_from_prompt(question, data, templates):
+    """
+    Finds a matching prompt (in English or French) and returns a new JSON object
+    containing the related fields defined in the template.
+
+    Matching is exact apart from letter case and leading/trailing whitespace.
+    The first template with a matching prompt wins.
+
+    Args:
+        question (str): The user's question.
+        data (dict): The main JSON data object.
+        templates (dict): The dictionary of prompt templates; its "templates"
+            entry is iterated, and each template may provide "prompts"
+            (with "en" / "fr" lists) and "target_keys".
+
+    Returns:
+        A dictionary (JSON object) with the extracted data, or an error object
+        ({"error": ...}) when the templates are invalid or nothing matches.
+        Nested target keys such as "items.amount" are stored under their last
+        segment ("amount"), so two target keys sharing a last segment overwrite
+        each other. A matching template without "target_keys" yields {}.
+    """
+    if not templates or "templates" not in templates:
+        print("Error: Invalid templates format.")
+        return {"error": "Invalid templates format."}
+
+    # Normalize the input question so matching ignores case and stray whitespace
+    normalized_question = question.strip().lower()
+
+    for template in templates["templates"]:
+        # Collect both english and french prompts; a missing language contributes nothing
+        prompts_by_language = template.get("prompts", {})
+        known_prompts = {
+            prompt.strip().lower()
+            for language in ("en", "fr")
+            for prompt in prompts_by_language.get(language, [])
+        }
+        if normalized_question not in known_prompts:
+            continue
+
+        result_object = {}
+        # .get() keeps a template without "target_keys" from raising KeyError
+        for key in template.get("target_keys", []):
+            # If the key was nested (e.g., 'items.amount'), the key in the result is the sub-key
+            simple_key = key.split(".")[-1]
+            result_object[simple_key] = get_nested_value(data, key)
+        return result_object
+
+    return {"error": "No matching prompt found."}
+
+
+# --- Main execution ---
+if __name__ == "__main__":
+    # Load the label data. The context manager closes the file, and the explicit
+    # encoding keeps accented characters intact whatever the system locale is.
+    with open(
+        "/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_1/label_data.json",
+        encoding="utf-8",
+    ) as label_file:
+        label_data = json.load(label_file)
+
+    # 1. Load the templates
+    prompt_templates = load_prompt_templates("prompt_templates.json")
+
+    # 2. Define questions to ask in both English and French, plus one that
+    #    is expected to match no template
+    demo_questions = (
+        ("EN", "Who is the doctor?"),
+        ("FR", "Aperçu de la facturation"),
+        ("Invalid", "What is the weather?"),
+    )
+
+    # 3. Get the label (sub-object) from the prompts
+    if prompt_templates:
+        # Resolve every answer before printing so any message printed during
+        # lookup appears ahead of the report
+        answers = [
+            get_label_from_prompt(question, label_data, prompt_templates)
+            for _, question in demo_questions
+        ]
+
+        for (tag, question), answer in zip(demo_questions, answers):
+            print(f"Question ({tag}): '{question}'")
+            print("Answer (JSON Object):")
+            print(json.dumps(answer, indent=2, ensure_ascii=False))
+            print("-" * 20)
--- a/easydistill/mmkd/create_vqa.py
+++ b/easydistill/mmkd/create_vqa.py
@@ -0,0 +1,53 @@
+import json
+from tqdm import tqdm
+from PIL import Image
+
+def get_total_pixels(image_paths):
+    """Return the summed pixel count (width * height) of the given images.
+
+    Any path that fails while being opened or measured is reported on stdout
+    and contributes nothing to the total.
+    """
+    pixel_count = 0
+    for image_path in image_paths:
+        try:
+            with Image.open(image_path) as image:
+                columns, rows = image.size
+                pixel_count = pixel_count + columns * rows
+        except Exception as error:
+            print(f"Error processing {image_path}: {error}")
+    return pixel_count
+
+json_label_path = "/home/nguyendc/model-factory/Finetuning-Automation/etc/data/docai_mgp_facture_v2_1.json"
+
+# Read with an explicit encoding so the result does not depend on the system locale
+with open(json_label_path, encoding="utf-8") as file:
+    json_data = json.load(file)
+
+home_dir = "/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/"
+# create VQA using json_data: one list of chat messages per input item
+vqa = []
+for item in tqdm(json_data):
+    turns = item["conversations"]
+    # The first turn is used as the system message
+    conversations = [{"role": "system", "content": turns[0]["value"]}]
+    # Image paths in the input are joined onto home_dir by plain concatenation
+    image_paths = [home_dir + image_path for image_path in item["images"]]
+    # Optional size filter, currently disabled:
+    # if get_total_pixels(image_paths) > 3200000:
+    #     continue
+    image_contents = [{"type": "image", "image": image_path} for image_path in image_paths]
+    # user content is shown in each odd position, assistant content in each even one
+    for i, turn in enumerate(turns[1:], start=1):
+        if i % 2 == 1:
+            # Every user message carries all images followed by the question text
+            conversations.append({
+                "role": "user",
+                "content": image_contents + [{"type": "text", "text": turn["value"]}]
+            })
+        else:
+            conversations.append({
+                "role": "assistant",
+                "content": turn["value"]
+            })
+    vqa.append(conversations)
+
+# save vqa to json file; ensure_ascii=False keeps accented characters readable
+with open("vqa.json", "w", encoding="utf-8") as file:
+    json.dump(vqa, file, indent=4, ensure_ascii=False)
--- a/easydistill/mmkd/dev-vqa/gen_vqa_bank.py
+++ b/easydistill/mmkd/dev-vqa/gen_vqa_bank.py
@@ -0,0 +1,237 @@
+import json
+import os
+import random
+from pathlib import Path
+import glob
+import re
+
+def load_json(filepath):
+    """
+    Parses the JSON file at ``filepath`` and returns its content.
+
+    A missing file or invalid JSON is reported on stdout and results in None.
+    """
+    parsed = None
+    try:
+        with open(filepath, 'r', encoding='utf-8') as handle:
+            parsed = json.load(handle)
+    except FileNotFoundError:
+        print(f"Error: The file was not found at {filepath}")
+    except json.JSONDecodeError as e:
+        print(f"Error: The file at {filepath} is not a valid JSON file. Details: {e}")
+    return parsed
+
+def read_text_file(filepath):
+    """
+    Returns the content of a UTF-8 text file with surrounding whitespace stripped.
+
+    A missing file is reported on stdout and results in None.
+    """
+    text = None
+    try:
+        with open(filepath, 'r', encoding='utf-8') as source:
+            text = source.read().strip()
+    except FileNotFoundError:
+        print(f"Error: The file was not found at {filepath}")
+    return text
+
+def format_items_list(items, language):
+    """
+    Renders a list of item dictionaries as a bulleted, human-readable string.
+
+    Each dict becomes one "- ..." line: the description first (unlabelled),
+    then quantity, date of service, mandatory coverage and amount, each with a
+    label. Fields that are absent or None are left out, and entries that are
+    not dicts are ignored. English labels are used when ``language`` is
+    "english"; any other value selects the French labels. An empty or missing
+    ``items`` gives "".
+    """
+    if not items:
+        return ""
+
+    use_english = language == "english"
+    # (item key, English label, French label) in output order
+    labelled_fields = (
+        ("quantity", "Quantity", "Quantité"),
+        ("date_of_service", "Date", "Date"),
+        ("mandatory_coverage", "Mandatory Coverage", "Couverture obligatoire"),
+        ("amount", "Amount", "Montant"),
+    )
+
+    bullet_lines = []
+    for entry in items:
+        if not isinstance(entry, dict):
+            continue
+
+        segments = []
+        description = entry.get("description")
+        if description is not None:
+            segments.append(f"{description}")
+
+        for key, english_label, french_label in labelled_fields:
+            field_value = entry.get(key)
+            if field_value is None:
+                continue
+            label = english_label if use_english else french_label
+            segments.append(f"{label}: {field_value}")
+
+        bullet_lines.append("- " + ", ".join(segments))
+
+    return "\n".join(bullet_lines)
+
+def get_conversational_answer(field, label_data, answer_bank, language):
+    """
+    Generates a complete conversational answer by selecting a template and filling it
+    with the appropriate value from the label data.
+
+    Args:
+        field (str): Name of the label field being answered.
+        label_data (dict): Label values of one document.
+        answer_bank (dict): Answer templates keyed by field name. For a field,
+            the language entry is either a list of "{value}" templates or, for
+            boolean fields, a dict with "true" / "false" lists; an optional
+            "null" entry holds per-language answers for missing values.
+        language (str): Key of the template language to use.
+
+    Returns:
+        str: A randomly chosen template filled with the value. When the value
+        is None, a random "null" answer for the language, or "" if there is
+        none. When no usable template exists (field absent from the bank,
+        language absent, empty template list), the plain ``str(value)``.
+    """
+    value = label_data.get(field)
+    field_templates = answer_bank.get(field)
+    plain_answer = str(value) if value is not None else ""
+
+    if not field_templates:
+        return plain_answer
+
+    if value is None:
+        # "Not found" phrasings live under the optional "null" entry
+        null_templates = field_templates.get("null", {}).get(language)
+        return random.choice(null_templates) if null_templates else ""
+
+    language_templates = field_templates.get(language)
+    if not language_templates:
+        # Unknown language or empty template set: answer with the raw value
+        # instead of raising KeyError / IndexError
+        return plain_answer
+
+    if field == "items":
+        if not isinstance(language_templates, list):
+            return plain_answer
+        template = random.choice(language_templates)
+        formatted_list_string = format_items_list(value, language)
+        return template.format(value=formatted_list_string)
+
+    if isinstance(value, bool):
+        # Boolean fields keep separate answer lists under "true" / "false"
+        if isinstance(language_templates, dict):
+            bool_templates = language_templates.get(str(value).lower())
+            if bool_templates:
+                return random.choice(bool_templates)
+        return plain_answer
+
+    if isinstance(language_templates, list):
+        template = random.choice(language_templates)
+        return template.format(value=value)
+
+    return plain_answer
+
+# --- Conversations Generation for Label Data ---
+def generate_field_level_conversations(labels_path, image_root, system_prompt_path, questions_path, answers_path, output_path):
+    """
+    Generates one conversational VQA entry (system, user, assistant messages) for
+    each label field that has questions in the question bank, and handles
+    multi-page documents by attaching every image file found for the document.
+
+    Args:
+        labels_path: JSON file with the label entries; each entry is read for
+            its "label" and "image" values.
+        image_root: Directory searched for the image files of each entry.
+        system_prompt_path: Text file holding the system prompt.
+        questions_path: JSON question bank, indexed as bank[field][language].
+        answers_path: JSON answer bank used to phrase the assistant answer.
+        output_path: Destination JSON file for the generated conversations.
+
+    Returns:
+        None. The conversations are written to ``output_path``; if any input
+        file could not be loaded (or is empty) a message is printed and nothing
+        is written.
+    """
+    all_data_entries = load_json(labels_path)
+    system_prompt = read_text_file(system_prompt_path)
+    question_bank = load_json(questions_path)
+    answer_bank = load_json(answers_path)
+
+    if not all_data_entries or not system_prompt or not question_bank or not answer_bank:
+        print("Could not load one or more necessary files. Exiting.")
+        return
+
+    final_conversations = []
+
+    # Process each entry in the main label file
+    for entry in all_data_entries:
+        label_data = entry.get("label")
+        image_filename_prefix = entry.get("image")
+
+        # Skip entries that are unlabeled, as we need the label to generate Q&A pairs
+        if not label_data or not image_filename_prefix:
+            continue
+
+        # Find all files in the image_root that start with the prefix.
+        # This handles cases like 'doc-1.jpg', 'doc-2.jpg', 'doc_scale.jpg' etc.
+        # The literal part is escaped so glob metacharacters in a file name
+        # ('[', ']', '*', '?') are matched verbatim.
+        prefix_stem = Path(image_filename_prefix).stem
+        search_pattern = glob.escape(os.path.join(image_root, prefix_stem)) + "*"
+        found_image_paths = sorted(glob.glob(search_pattern))
+
+        if not found_image_paths:
+            print(f"Warning: No images found for prefix '{prefix_stem}' in '{image_root}'. Skipping.")
+            continue
+
+        # Create a list of image dictionaries for the user message
+        image_content_list = [{"type": "image", "image": path} for path in found_image_paths]
+
+        # --- Create a new conversation for EACH field in the label ---
+        for field in label_data:
+            if not isinstance(field, str):
+                continue
+            if field not in question_bank:
+                continue
+
+            language = random.choice(['english', 'french'])
+
+            # Get the question from the question bank; a field without questions
+            # in the chosen language is skipped instead of raising
+            candidate_questions = question_bank[field].get(language)
+            if not candidate_questions:
+                continue
+            question_text = random.choice(candidate_questions)
+
+            # Get the conversational answer from the answer bank
+            answer_text = get_conversational_answer(field, label_data, answer_bank, language)
+
+            # --- Assemble the conversation in the desired format ---
+            system_message = {
+                "role": "system",
+                "content": system_prompt
+            }
+
+            user_message = {
+                "role": "user",
+                # The content is the list of image dicts, followed by the text dict
+                "content": image_content_list + [{"type": "text", "text": "<image>"+ question_text}]
+            }
+
+            assistant_message = {
+                "role": "assistant",
+                "content": answer_text
+            }
+
+            final_conversations.append([system_message, user_message, assistant_message])
+
+    # Save the final list of conversations to the output file
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(final_conversations, f, indent=4, ensure_ascii=False)
+
+    print(f"Success! Generated {len(final_conversations)} conversational VQA entries.")
+    print(f"Formatted data saved to: {output_path}")
+
+# --- Conversations Generation for only Images ---
+def generate_image_only_conversations(image_root, system_prompt_path, questions_path, output_path):
+    """
+    Generates conversational VQA pairs for each document based on images only (no labels).
+    Groups all images with the same prefix (including _1_scale, _2_scale, etc.) into the same conversation.
+    One conversation (system + user message) is produced per document and per
+    question of the question bank, across every field and language.
+
+    Args:
+        image_root: Directory whose image files are grouped into documents.
+        system_prompt_path: Text file holding the system prompt.
+        questions_path: JSON question bank, iterated as bank[field][language].
+        output_path: Destination JSON file for the generated conversations.
+
+    Returns:
+        None. The conversations are written to ``output_path``; if the system
+        prompt or question bank could not be loaded (or is empty) a message is
+        printed and nothing is written.
+    """
+    system_prompt = read_text_file(system_prompt_path)
+    question_bank = load_json(questions_path)
+
+    if not system_prompt or not question_bank:
+        print("Could not load one or more necessary files. Exiting.")
+        return
+
+    # Only files with a known image extension become conversation inputs, so
+    # other files stored in image_root (for example JSON label or output files)
+    # are not mistaken for document pages.
+    image_suffixes = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tif", ".tiff", ".webp"}
+
+    # Find all images and group by prefix
+    all_image_paths = sorted(glob.glob(os.path.join(image_root, "*")))
+    prefix_to_images = {}
+    for path in all_image_paths:
+        if not os.path.isfile(path):
+            continue
+        image_file = Path(path)
+        if image_file.suffix.lower() not in image_suffixes:
+            continue
+        # Remove suffixes like _1_scale, _2_scale, etc.
+        prefix = re.sub(r'(_\d+(_scale)?)$', '', image_file.stem)
+        prefix_to_images.setdefault(prefix, []).append(path)
+
+    final_conversations = []
+
+    for prefix, image_paths in prefix_to_images.items():
+        image_content_list = [{"type": "image", "image": path} for path in sorted(image_paths)]
+        for field, lang_dict in question_bank.items():
+            for language in lang_dict:
+                for question_text in lang_dict[language]:
+                    system_message = {
+                        "role": "system",
+                        "content": system_prompt
+                    }
+                    user_message = {
+                        "role": "user",
+                        "content": image_content_list + [{"type": "text", "text": "<image>" + question_text}]
+                    }
+                    final_conversations.append([system_message, user_message])
+
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(final_conversations, f, indent=4, ensure_ascii=False)
+
+    print(f"Success! Generated {len(final_conversations)} image-only conversational VQA entries.")
+    print(f"Formatted data saved to: {output_path}")
+    
+# --- Main Execution Block ---
+if __name__ == "__main__":
+    
+    # Define file paths (absolute locations on the author's machine; adjust before running elsewhere)
+    IMAGE_ROOT = '/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_1'
+    # Label file, used only by the label-based generator that is disabled below
+    LABELS_FILE = os.path.join(IMAGE_ROOT, 'label_data.json')
+    SYSTEM_PROMPT_FILE = '/home/nguyendc/phong-dev/distill/prompt/system_prompt.txt'
+    QUESTION_BANK_FILE = '/home/nguyendc/phong-dev/distill/prompt/question_bank.json'
+    # Answer bank, used only by the label-based generator that is disabled below
+    ANSWER_BANK_FILE = '/home/nguyendc/phong-dev/distill/prompt/answer_bank.json'
+    # NOTE: the output is written into IMAGE_ROOT, the same directory that is scanned for images
+    OUTPUT_FILE = os.path.join(IMAGE_ROOT, 'vqa_nolabel.json')
+    
+    # Run the main generation function: the label-based variant is commented out,
+    # only the image-only variant is executed
+    # generate_field_level_conversations(LABELS_FILE, IMAGE_ROOT, SYSTEM_PROMPT_FILE, QUESTION_BANK_FILE, ANSWER_BANK_FILE, OUTPUT_FILE)
+    generate_image_only_conversations(IMAGE_ROOT, SYSTEM_PROMPT_FILE, QUESTION_BANK_FILE, OUTPUT_FILE)
--- a/easydistill/mmkd/dev-vqa/qa_bank/answer_bank.json
+++ b/easydistill/mmkd/dev-vqa/qa_bank/answer_bank.json
@@ -0,0 +1,597 @@
+{
+    "is_bill": {
+        "english": {
+            "true": [
+                "Yes, this document is an invoice.",
+                "Correct, this appears to be a bill.",
+                "Confirmed, I've identified it as a billing statement.",
+                "Yes, I can confirm this is an invoice.",
+                "That's right, it's a bill."
+            ],
+            "false": [
+                "No, this does not appear to be an invoice.",
+                "I don't believe this is a bill.",
+                "This document seems to be something other than an invoice.",
+                "No, this is not a billing statement.",
+                "I would classify this as something other than a bill."
+            ]
+        },
+        "french": {
+            "true": [
+                "Oui, ce document est une facture.",
+                "Correct, il s'agit bien d'une facture.",
+                "Confirmé, je l'ai identifié comme un relevé de facturation.",
+                "Oui, je peux confirmer que c'est une facture.",
+                "C'est exact, c'est une facture."
+            ],
+            "false": [
+                "Non, ce document ne semble pas être une facture.",
+                "Je ne pense pas qu'il s'agisse d'une facture.",
+                "Ce document semble être autre chose qu'une facture.",
+                "Non, il ne s'agit pas d'un relevé de facturation.",
+                "Je ne classifierais pas ce document comme une facture."
+            ]
+        }
+    },
+    "profession": {
+        "english": [
+            "The healthcare provider's profession is {value}.",
+            "This document is from a {value}.",
+            "The medical specialty mentioned is {value}.",
+            "I have identified the profession as {value}."
+        ],
+        "french": [
+            "La profession du professionnel de santé est {value}.",
+            "Ce document provient d'un {value}.",
+            "La spécialité médicale mentionnée est {value}.",
+            "J'ai identifié la profession comme étant {value}."
+        ],
+        "null": {
+            "english": [
+                "I could not determine the specific profession from this document.",
+                "The healthcare provider's profession is not mentioned.",
+                "The document does not specify a medical specialty."
+            ],
+            "french": [
+                "Je n'ai pas pu déterminer la profession spécifique à partir de ce document.",
+                "La profession du professionnel de santé n'est pas mentionnée.",
+                "Le document ne spécifie pas de spécialité médicale."
+            ]
+        }
+    },
+    "adeli_number": {
+        "english": [
+            "The Adeli number is {value}.",
+            "I found the Adeli number: {value}.",
+            "The provider's Adeli registration number is {value}.",
+            "The 9-digit Adeli identifier is {value}."
+        ],
+        "french": [
+            "Le numéro Adeli est {value}.",
+            "J'ai trouvé le numéro Adeli : {value}.",
+            "Le numéro d'enregistrement Adeli du professionnel est {value}.",
+            "L'identifiant Adeli à 9 chiffres est {value}."
+        ],
+        "null": {
+            "english": [
+                "I could not find an Adeli number on this document.",
+                "The provider's Adeli number is not mentioned.",
+                "There is no Adeli number listed."
+            ],
+            "french": [
+                "Je n'ai pas pu trouver de numéro Adeli sur ce document.",
+                "Le numéro Adeli du professionnel n'est pas mentionné.",
+                "Aucun numéro Adeli n'est indiqué."
+            ]
+        }
+    },
+    "rpps_number": {
+        "english": [
+            "The RPPS number is {value}.",
+            "The 11-digit RPPS identifier is {value}.",
+            "I found the RPPS code: {value}.",
+            "The provider's RPPS number is listed as {value}."
+        ],
+        "french": [
+            "Le numéro RPPS est {value}.",
+            "L'identifiant RPPS à 11 chiffres est {value}.",
+            "J'ai trouvé le code RPPS : {value}.",
+            "Le numéro RPPS du professionnel est {value}."
+        ],
+        "null": {
+            "english": [
+                "I could not find an RPPS number on this document.",
+                "The RPPS number is not mentioned.",
+                "No RPPS identifier is listed in the document."
+            ],
+            "french": [
+                "Je n'ai pas pu trouver de numéro RPPS sur ce document.",
+                "Le numéro RPPS n'est pas mentionné.",
+                "Aucun identifiant RPPS n'est indiqué dans le document."
+            ]
+        }
+    },
+    "finess_number": {
+        "english": [
+            "The FINESS number is {value}.",
+            "The 9-digit FINESS identifier is {value}.",
+            "I found the FINESS code: {value}.",
+            "The healthcare facility's FINESS number is {value}."
+        ],
+        "french": [
+            "Le numéro FINESS est {value}.",
+            "L'identifiant FINESS à 9 chiffres est {value}.",
+            "J'ai trouvé le code FINESS : {value}.",
+            "Le numéro FINESS de l'établissement de santé est {value}."
+        ],
+        "null": {
+            "english": [
+                "I could not find a FINESS number on this document.",
+                "The FINESS number is not mentioned.",
+                "No FINESS identifier is listed in the document."
+            ],
+            "french": [
+                "Je n'ai pas pu trouver de numéro FINESS sur ce document.",
+                "Le numéro FINESS n'est pas mentionné.",
+                "Aucun identifiant FINESS n'est indiqué dans le document."
+            ]
+        }
+    },
+    "doctor_name": {
+        "english": [
+            "The doctor's name is {value}.",
+            "The document mentions Dr. {value}.",
+            "The healthcare provider is listed as {value}.",
+            "I found the name: {value}."
+        ],
+        "french": [
+            "Le nom du médecin est {value}.",
+            "Le document mentionne le Dr {value}.",
+            "Le professionnel de santé est {value}.",
+            "J'ai trouvé le nom : {value}."
+        ],
+        "null": {
+            "english": [
+                "I could not find a doctor's name on this document.",
+                "The doctor's name is not specified.",
+                "No doctor is mentioned in the document."
+            ],
+            "french": [
+                "Je n'ai pas pu trouver de nom de médecin sur ce document.",
+                "Le nom du médecin n'est pas spécifié.",
+                "Aucun médecin n'est mentionné dans le document."
+            ]
+        }
+    },
+    "prescripteur_finess_number": {
+        "english": [
+            "The prescriber's FINESS number is {value}.",
+            "The FINESS number for the prescriber is {value}.",
+            "I found the prescriber's FINESS code: {value}."
+        ],
+        "french": [
+            "Le numéro FINESS du prescripteur est {value}.",
+            "Le code FINESS du prescripteur est {value}.",
+            "J'ai trouvé le numéro FINESS du prescripteur : {value}."
+        ],
+        "null": {
+            "english": [
+                "I could not find the prescriber's FINESS number on this document.",
+                "The prescriber's FINESS number is not mentioned.",
+                "No prescriber FINESS identifier is listed."
+            ],
+            "french": [
+                "Je n'ai pas pu trouver le numéro FINESS du prescripteur sur ce document.",
+                "Le numéro FINESS du prescripteur n'est pas mentionné.",
+                "Aucun identifiant FINESS de prescripteur n'est indiqué."
+            ]
+        }
+    },
+    "total_billed": {
+        "english": [
+            "The total billed amount is {value}.",
+            "The grand total comes to {value}.",
+            "The total amount due is {value}.",
+            "The invoice total is {value}."
+        ],
+        "french": [
+            "Le montant total facturé est de {value}.",
+            "Le total général s'élève à {value}.",
+            "Le montant total dû est de {value}.",
+            "Le total de la facture est de {value}."
+        ],
+        "null": {
+            "english": [
+                "I could not find a total billed amount on the invoice.",
+                "The total amount is not specified.",
+                "The document does not mention a grand total."
+            ],
+            "french": [
+                "Je n'ai pas pu trouver de montant total facturé sur la facture.",
+                "Le montant total n'est pas spécifié.",
+                "Le document ne mentionne pas de total général."
+            ]
+        }
+    },
+    "bill_paid": {
+        "english": {
+            "true": [
+                "Yes, the invoice has been paid.",
+                "Correct, this bill is marked as settled.",
+                "The document confirms that payment has been received.",
+                "Yes, it appears to be acquitted."
+            ],
+            "false": [
+                "No, the invoice does not appear to be paid.",
+                "The payment status is still outstanding.",
+                "I could not find any indication that the bill has been paid.",
+                "No, this is a payment request, not a receipt of payment."
+            ]
+        },
+        "french": {
+            "true": [
+                "Oui, la facture a été payée.",
+                "Correct, cette facture est marquée comme acquittée.",
+                "Le document confirme que le règlement a été reçu.",
+                "Oui, il s'agit bien d'une quittance."
+            ],
+            "false": [
+                "Non, la facture ne semble pas avoir été payée.",
+                "Le statut du paiement est toujours en attente.",
+                "Je n'ai trouvé aucune indication que la facture a été réglée.",
+                "Non, il s'agit d'une demande de paiement, pas d'un reçu."
+            ]
+        }
+    },
+    "amount_paid": {
+        "english": [
+            "The amount paid was {value}.",
+            "A payment of {value} was received.",
+            "The document shows that {value} has been paid.",
+            "The settled amount is {value}."
+        ],
+        "french": [
+            "Le montant payé était de {value}.",
+            "Un paiement de {value} a été reçu.",
+            "Le document indique que {value} a été payé.",
+            "Le montant réglé est de {value}."
+        ],
+        "null": {
+            "english": [
+                "No payment amount is specified on the document.",
+                "I could not find the amount that was paid.",
+                "The document does not mention a specific payment amount."
+            ],
+            "french": [
+                "Aucun montant de paiement n'est spécifié sur le document.",
+                "Je n'ai pas pu trouver le montant qui a été payé.",
+                "Le document ne mentionne pas de montant de paiement spécifique."
+            ]
+        }
+    },
+    "mandatory_coverage": {
+        "english": [
+            "The mandatory coverage amount is {value}.",
+            "The amount covered by the compulsory insurance (AMO/RO) is {value}.",
+            "The reimbursement from the mandatory scheme is {value}.",
+            "The 'Part RO' is listed as {value}."
+            ],
+        "french": [
+            "Le montant de la couverture obligatoire est de {value}.",
+            "La part remboursée par le Régime Obligatoire (RO) s'élève à {value}.",
+            "Le remboursement de la part AMO est de {value}.",
+            "La 'Part RO' est de {value}."
+            ],
+        "null": {
+            "english": [
+                "The mandatory coverage amount is not specified.",
+                "I could not find the amount for the compulsory insurance portion.",
+                "The document does not list a reimbursement amount for the 'Régime Obligatoire'."
+            ],
+            "french": [
+                "Le montant de la couverture obligatoire n'est pas spécifié.",
+                "Je n'ai pas pu trouver le montant pour la part obligatoire.",
+                "Le document n'indique pas de montant pour le remboursement du Régime Obligatoire (RO)."
+            ]
+        }
+    },
+    "complementary_coverage": {
+        "english": [
+            "The complementary coverage amount is {value}.",
+            "The amount covered by the 'Mutuelle' is {value}.",
+            "The reimbursement from the supplemental insurance (AMC) is {value}.",
+            "The 'Part RC' is listed as {value}."
+        ],
+        "french": [
+            "Le montant de la couverture complémentaire est de {value}.",
+            "La part remboursée par la mutuelle s'élève à {value}.",
+            "Le remboursement de l'assurance complémentaire (AMC) est de {value}.",
+            "La 'Part RC' est de {value}."
+        ],
+        "null": {
+            "english": [
+                "The complementary coverage amount is not specified.",
+                "I could not find the amount for the supplemental insurance portion.",
+                "The document does not list a reimbursement amount for the 'Mutuelle' or 'AMC'."
+            ],
+            "french": [
+                "Le montant de la couverture complémentaire n'est pas spécifié.",
+                "Je n'ai pas pu trouver le montant pour la part complémentaire.",
+                "Le document n'indique pas de montant pour le remboursement de la mutuelle (AMC)."
+            ]
+        }
+    },
+    "client_part": {
+        "english": [
+            "The amount to be paid by the client is {value}.",
+            "The client's share comes to {value}.",
+            "The out-of-pocket amount is {value}.",
+            "The 'Part Assuré' is listed as {value}."
+        ],
+        "french": [
+            "Le montant à la charge du client est de {value}.",
+            "La part client s'élève à {value}.",
+            "Le reste à charge est de {value}.",
+            "La 'Part Assuré' est de {value}."
+        ],
+        "null": {
+            "english": [
+                "The client's share is not specified on the document.",
+                "I could not find the out-of-pocket amount.",
+                "The document does not list an amount for the 'Part Client'."
+            ],
+            "french": [
+                "La part client n'est pas spécifiée sur le document.",
+                "Je n'ai pas pu trouver le montant du reste à charge.",
+                "Le document n'indique pas de montant pour la 'Part Client'."
+            ]
+        }
+    },
+    "remaining_payment": {
+        "english": [
+            "The remaining balance to be paid is {value}.",
+            "There is still {value} owed on this invoice.",
+            "The outstanding balance is {value}.",
+            "The amount left to pay is {value}."
+        ],
+        "french": [
+            "Le reste à payer est de {value}.",
+            "Il reste encore {value} à régler sur cette facture.",
+            "Le solde restant dû est de {value}.",
+            "Le montant restant à payer s'élève à {value}."
+        ],
+        "null": {
+            "english": [
+                "There is no remaining balance indicated.",
+                "The invoice appears to be fully settled, or the remaining amount is not specified.",
+                "I could not find an outstanding balance on the document."
+            ],
+            "french": [
+                "Aucun reste à payer n'est indiqué.",
+                "La facture semble être entièrement réglée, ou le montant restant n'est pas spécifié.",
+                "Je n'ai pas pu trouver de solde restant dû sur le document."
+            ]
+        }
+    },
+    "insured_name": {
+        "english": [
+            "The insured person's name is {value}.",
+            "The policyholder is listed as {value}.",
+            "The name of the insured is {value}.",
+            "The document is for an insured person named {value}."
+        ],
+        "french": [
+            "Le nom de l'assuré est {value}.",
+            "Le titulaire de la police est {value}.",
+            "Le nom de la personne assurée est {value}.",
+            "Le document concerne un assuré nommé {value}."
+        ],
+        "null": {
+            "english": [
+                "The insured person's name is not specified on the document.",
+                "I could not find the name of the policyholder.",
+                "The document does not mention who is insured."
+            ],
+            "french": [
+                "Le nom de l'assuré n'est pas spécifié sur le document.",
+                "Je n'ai pas pu trouver le nom du titulaire de la police.",
+                "Le document ne mentionne pas qui est l'assuré."
+            ]
+        }
+    },
+    "insured_dob": {
+        "english": [
+            "The insured person's date of birth is {value}.",
+            "The date of birth for the insured is listed as {value}.",
+            "The insured was born on {value}."
+        ],
+        "french": [
+            "La date de naissance de l'assuré est le {value}.",
+            "L'assuré(e) est né(e) le {value}.",
+            "La date de naissance indiquée pour l'assuré est le {value}."
+        ],
+        "null": {
+            "english": [
+                "The insured person's date of birth is not specified.",
+                "I could not find the date of birth for the insured person.",
+                "The document does not mention the insured's date of birth."
+            ],
+            "french": [
+                "La date de naissance de l'assuré n'est pas spécifiée.",
+                "Je n'ai pas pu trouver la date de naissance de l'assuré.",
+                "Le document ne mentionne pas la date de naissance de la personne assurée."
+            ]
+        }
+    },
+    "beneficiary_name": {
+        "english": [
+            "The beneficiary's name is {value}.",
+            "The services were provided to {value}.",
+            "The patient is listed as {value}.",
+            "The invoice is for {value}."
+        ],
+        "french": [
+            "Le nom du bénéficiaire est {value}.",
+            "Les services ont été fournis à {value}.",
+            "Le patient est {value}.",
+            "La facture est au nom de {value}."
+        ],
+        "null": {
+            "english": [
+                "The beneficiary's name is not specified on the document.",
+                "I could not find the name of the patient or service recipient.",
+                "The document does not mention a beneficiary."
+            ],
+            "french": [
+                "Le nom du bénéficiaire n'est pas spécifié sur le document.",
+                "Je n'ai pas pu trouver le nom du patient ou du bénéficiaire des soins.",
+                "Le document ne mentionne pas de bénéficiaire."
+            ]
+        }
+    },
+    "beneficiary_dob": {
+        "english": [
+            "The beneficiary's date of birth is {value}.",
+            "The patient was born on {value}.",
+            "The date of birth for the beneficiary is listed as {value}."
+        ],
+        "french": [
+            "La date de naissance du bénéficiaire est le {value}.",
+            "Le patient est né(e) le {value}.",
+            "La date de naissance indiquée pour le bénéficiaire est le {value}."
+        ],
+        "null": {
+            "english": [
+                "The beneficiary's date of birth is not specified.",
+                "I could not find the date of birth for the patient.",
+                "The document does not mention the beneficiary's date of birth."
+            ],
+            "french": [
+                "La date de naissance du bénéficiaire n'est pas spécifiée.",
+                "Je n'ai pas pu trouver la date de naissance du patient.",
+                "Le document ne mentionne pas la date de naissance du bénéficiaire."
+            ]
+        }
+    },
+    "invoice_date": {
+        "english": [
+            "The invoice is dated {value}.",
+            "The issue date of the invoice is {value}.",
+            "The document date is listed as {value}."
+        ],
+        "french": [
+            "La facture est datée du {value}.",
+            "La date d'émission de la facture est le {value}.",
+            "La date du document est le {value}."
+        ],
+        "null": {
+            "english": [
+                "The invoice date is not specified on the document.",
+                "I could not find the issue date for this invoice.",
+                "The document does not have a date."
+            ],
+            "french": [
+                "La date de la facture n'est pas spécifiée sur le document.",
+                "Je n'ai pas pu trouver la date d'émission de cette facture.",
+                "Le document n'a pas de date."
+            ]
+        }
+    },
+    "security_number": {
+        "english": [
+            "The Social Security number is {value}.",
+            "The 'N° SS' is listed as {value}.",
+            "I found the Social Security number: {value}."
+        ],
+        "french": [
+            "Le numéro de Sécurité Sociale est {value}.",
+            "Le 'N° SS' est {value}.",
+            "J'ai trouvé le numéro de Sécurité Sociale : {value}."
+        ],
+        "null": {
+            "english": [
+                "I could not find a Social Security number on this document.",
+                "The Social Security number ('N° SS') is not mentioned.",
+                "No INSEE or Social Security number is listed."
+            ],
+            "french": [
+                "Je n'ai pas pu trouver de numéro de Sécurité Sociale sur ce document.",
+                "Le numéro de Sécurité Sociale ('N° SS') n'est pas mentionné.",
+                "Aucun numéro INSEE ou de Sécurité Sociale n'est indiqué."
+            ]
+        }
+    },
+    "invoice_issuer": {
+        "english": [
+            "The invoice was issued by {value}.",
+            "The service provider is {value}.",
+            "The invoice is from {value}.",
+            "The issuer is listed as {value}."
+        ],
+        "french": [
+            "La facture a été émise par {value}.",
+            "Le prestataire de services est {value}.",
+            "La facture provient de {value}.",
+            "L'émetteur de la facture est {value}."
+        ],
+        "null": {
+            "english": [
+                "The issuer's name is not specified on the document.",
+                "I could not find the name of the service provider.",
+                "The document does not mention who issued the invoice."
+            ],
+            "french": [
+                "Le nom de l'émetteur n'est pas spécifié sur le document.",
+                "Je n'ai pas pu trouver le nom du prestataire de services.",
+                "Le document ne mentionne pas qui a émis la facture."
+            ]
+        }
+    },
+    "currency": {
+        "english": [
+            "The currency used is {value}.",
+            "The amounts are listed in {value}.",
+            "The currency symbol found is {value}."
+        ],
+        "french": [
+            "La devise utilisée est {value}.",
+            "Les montants sont indiqués en {value}.",
+            "Le symbole monétaire trouvé est {value}."
+        ],
+        "null": {
+            "english": [
+                "The currency is not specified on the document.",
+                "I could not identify the currency used.",
+                "No currency symbol or code was found."
+            ],
+            "french": [
+                "La devise n'est pas spécifiée sur le document.",
+                "Je n'ai pas pu identifier la devise utilisée.",
+                "Aucun symbole ou code de devise n'a été trouvé."
+            ]
+        }
+    },
+    "items": {
+        "english": [
+            "Here is a breakdown of the services on the invoice:\n{value}",
+            "The following services were billed:\n{value}",
+            "Certainly, here are the details for each service item:\n{value}"
+        ],
+        "french": [
+            "Voici le détail des services figurant sur la facture :\n{value}",
+            "Les services suivants ont été facturés :\n{value}",
+            "Bien sûr, voici le détail de chaque service :\n{value}"
+        ],
+        "null": {
+            "english": [
+                "I could not find any service details on this invoice.",
+                "The document does not appear to list any specific services or items."
+            ],
+            "french": [
+                "Je n'ai trouvé aucun détail de service sur cette facture.",
+                "Le document ne semble pas répertorier de services ou d'articles spécifiques."
+            ]
+        }
+    }
+}
+
--- a/easydistill/mmkd/dev-vqa/qa_bank/question_bank.json
+++ b/easydistill/mmkd/dev-vqa/qa_bank/question_bank.json
@@ -0,0 +1,459 @@
+{
+    "is_bill": {
+        "english": [
+            "Is this document an invoice?",
+            "Does this document appear to be a bill?",
+            "Can you confirm if this document is a billing statement?",
+            "Is this a financial invoice?",
+            "Does the image show a bill or an invoice?",
+            "Would you classify this as an invoice?",
+            "Is the document shown a medical bill or invoice?",
+            "Is this an official invoice document?",
+            "Does the document indicate a payment request?",
+            "Is this a billing-related document?"
+        ],
+        "french": [
+            "Ce document est-il une facture ?",
+            "Ce document semble-t-il être une note de frais ?",
+            "Pouvez-vous confirmer si ce document est un relevé de facturation ?",
+            "S'agit-il d'une facture financière ?",
+            "L'image montre-t-elle une facture ou une note de frais ?",
+            "Classeriez-vous ce document comme une facture ?",
+            "Est-ce une facture ou note médicale ?",
+            "S'agit-il d'un document de facturation officiel ?",
+            "Le document indique-t-il une demande de paiement ?",
+            "Est-ce un document lié à une facturation ?"
+        ]
+    },
+    "profession": {
+        "english": [
+            "What is the type of healthcare profession mentioned in this document?",
+            "Does this document concern a specific medical profession? If yes, which one?",
+            "Can you identify the profession of the healthcare provider in this document?",
+            "Is this document related to a particular medical profession?",
+            "Which medical or paramedical field does this document refer to?",
+            "Is this document issued by an optician, physiotherapist, pharmacist, or another healthcare professional?",
+            "Does this document involve a profession like psychology, radiology, or dentistry?",
+            "Which healthcare provider is associated with this invoice or document?",
+            "Does the document mention a profession like gynecology, dietetics, or osteopathy?",
+            "Can you determine the medical field (e.g., nurse, speech therapy, etc.) from the document?"
+        ],
+        "french": [
+            "Quel est le type de profession de santé mentionné dans ce document ?",
+            "Ce document concerne-t-il une profession médicale spécifique ? Si oui, laquelle ?",
+            "Pouvez-vous identifier la profession du professionnel de santé indiquée dans ce document ?",
+            "Ce document fait-il référence à une spécialité médicale ?",
+            "De quel domaine médical ou paramédical s'agit-il ?",
+            "Ce document provient-il d'un opticien, kiné, pharmacien, ou autre profession de santé ?",
+            "Ce document est-il lié à une profession comme la psychologie, la radiologie, ou la dentisterie ?",
+            "Quel professionnel de santé est concerné par cette facture ou ce document ?",
+            "Ce document indique-t-il une spécialité comme la gynécologie, la diététique, ou l'ostéopathie ?",
+            "Peut-on déduire le domaine médical (ex: infirmier, orthophonie, etc.) à partir du document ?"
+        ]
+    },
+    "adeli_number": {
+        "english": [
+            "What is the Adeli number mentioned in the document?",
+            "Is there an Adeli number associated with the healthcare provider?",
+            "Can you extract the Adeli number (9-digit identifier) from this document?",
+            "Does the document include the provider's Adeli number?",
+            "What 9-digit Adeli identifier appears in the document, if any?",
+            "Is the healthcare provider's Adeli code visible in the document?",
+            "Can you find and provide the Adeli registration number?",
+            "Does this document contain a professional Adeli ID?"
+        ],
+        "french": [
+            "Quel est le numéro Adeli mentionné dans le document ?",
+            "Y a-t-il un numéro Adeli associé au professionnel de santé ?",
+            "Pouvez-vous extraire le numéro Adeli (identifiant à 9 chiffres) de ce document ?",
+            "Le document inclut-il le numéro Adeli du praticien ?",
+            "Quel est l'identifiant Adeli à 9 chiffres indiqué, s'il y en a un ?",
+            "Le code Adeli du professionnel est-il visible dans le document ?",
+            "Pouvez-vous retrouver le numéro d'enregistrement Adeli ?",
+            "Ce document contient-il un identifiant professionnel Adeli ?"
+        ]
+    },
+    "rpps_number": {
+        "english": [
+            "What is the RPPS number mentioned in the document?",
+            "Can you extract the RPPS number (11-digit identifier)?",
+            "Does the document contain an RPPS identifier?",
+            "What 11-digit RPPS code appears in the text?",
+            "Is there a healthcare provider RPPS number indicated?",
+            "Can you find the RPPS number listed after the term 'RPPS'?",
+            "Does the document specify a professional RPPS ID?",
+            "Is the RPPS code visible in the document content?"
+        ],
+        "french": [
+            "Quel est le numéro RPPS mentionné dans le document ?",
+            "Pouvez-vous extraire le numéro RPPS (identifiant à 11 chiffres) ?",
+            "Le document contient-il un identifiant RPPS ?",
+            "Quel est le code RPPS à 11 chiffres indiqué dans le texte ?",
+            "Y a-t-il un numéro RPPS pour le professionnel de santé ?",
+            "Pouvez-vous retrouver le numéro RPPS indiqué après le mot 'RPPS' ?",
+            "Le document spécifie-t-il un identifiant professionnel RPPS ?",
+            "Le code RPPS est-il visible dans le contenu du document ?"
+        ]
+    },
+    "finess_number": {
+        "english": [
+            "What is the FINESS number mentioned in the document?",
+            "Can you extract the 9-digit FINESS identifier?",
+            "Is there a FINESS number provided in the document?",
+            "What 9-digit number follows the term 'finess'?",
+            "Does the document include a CPAM identifier or FINESS code?",
+            "Can you find the number indicated after 'finess' or 'identifiant CPAM'?",
+            "Is a healthcare facility identifier (FINESS) present?",
+            "Which number is listed after the term 'identifiant CPAM'?"
+        ],
+        "french": [
+            "Quel est le numéro FINESS mentionné dans le document ?",
+            "Pouvez-vous extraire l'identifiant FINESS à 9 chiffres ?",
+            "Le document contient-il un numéro FINESS ?",
+            "Quel est le numéro à 9 chiffres après le terme 'finess' ?",
+            "Le document comporte-t-il un identifiant CPAM ou un code FINESS ?",
+            "Pouvez-vous trouver le numéro indiqué après 'finess' ou 'identifiant CPAM' ?",
+            "Un identifiant de l'établissement de santé (FINESS) est-il présent ?",
+            "Quel numéro est mentionné après le terme 'identifiant CPAM' ?"
+        ]
+    },
+    "doctor_name": {
+        "english": [
+            "What is the full name of the doctor?",
+            "Can you extract the doctor's full name?",
+            "Who is the doctor mentioned in the document?",
+            "What doctor is listed on the document?",
+            "Can you provide the complete name of the healthcare provider?",
+            "What is the name of the medical professional on this document?",
+            "Is there a doctor's name written in the document?",
+            "Which doctor signed or issued this document?"
+        ],
+        "french": [
+            "Quel est le nom complet du médecin ?",
+            "Pouvez-vous extraire le nom complet du médecin ?",
+            "Quel médecin est mentionné dans le document ?",
+            "Quel est le nom du médecin indiqué sur le document ?",
+            "Pouvez-vous fournir le nom complet du professionnel de santé ?",
+            "Quel est le nom du professionnel médical dans ce document ?",
+            "Y a-t-il un nom de médecin inscrit dans le document ?",
+            "Quel médecin a signé ou délivré ce document ?"
+        ]
+    },
+    "prescripteur_finess_number": {
+        "english": [
+            "What is the prescriber's FINESS number?",
+            "Can you extract the FINESS number of the prescriber?",
+            "Find the prescriber's FINESS identification number.",
+            "What is the 9-digit FINESS identifier listed on the document?",
+            "Can you provide the FINESS ID of the prescriber?",
+            "What is the number associated with the term 'FINESS'?",
+            "Is there a prescriber FINESS number written on the invoice?",
+            "Extract the prescriber's FINESS code."
+        ],
+        "french": [
+            "Quel est le numéro FINESS du prescripteur ?",
+            "Pouvez-vous extraire le numéro FINESS du prescripteur ?",
+            "Trouvez le numéro d'identification FINESS du prescripteur.",
+            "Quel est l'identifiant FINESS à 9 chiffres indiqué sur le document ?",
+            "Pouvez-vous fournir l'ID FINESS du prescripteur ?",
+            "Quel est le numéro associé au terme 'FINESS' ?",
+            "Y a-t-il un numéro FINESS de prescripteur inscrit sur la facture ?",
+            "Extraire le code FINESS du prescripteur."
+        ]
+    },
+    "total_billed": {
+        "english": [
+            "What is the total billed amount?",
+            "Can you extract the total amount from the invoice?",
+            "What is the grand total?",
+            "How much is the total billed?",
+            "Find the invoice total.",
+            "What is the final total amount to pay?",
+            "Extract the total amount due.",
+            "Provide the total sum of the invoice."
+        ],
+        "french": [
+            "Quel est le montant total facturé ?",
+            "Pouvez-vous extraire le montant total de la facture ?",
+            "Quel est le total général ?",
+            "À combien s'élève le total de la facture ?",
+            "Trouvez le montant total à payer.",
+            "Quel est le montant total final de la facture ?",
+            "Extraire le montant total dû.",
+            "Quelle est la somme totale de la facture ?"
+        ]
+    },
+    "bill_paid": {
+        "english": [
+            "Has this invoice been paid?",
+            "Is the bill marked as paid?",
+            "Does the document indicate that the payment has been settled?",
+            "Can you verify if this bill is acquitted?",
+            "Determine the payment status of this invoice.",
+            "Is this a receipt confirming payment?",
+            "Check if the bill has been cleared or paid.",
+            "Does the document mention terms like 'acquittée' or 'payée'?"
+        ],
+        "french": [
+            "Cette facture a-t-elle été payée ?",
+            "La facture est-elle marquée comme 'acquittée' ou 'payée' ?",
+            "Le document indique-t-il que le paiement a été reçu ?",
+            "Pouvez-vous vérifier si cette facture est une quittance ?",
+            "Déterminez le statut de paiement de cette facture.",
+            "S'agit-il d'un reçu confirmant le règlement ?",
+            "Vérifiez si le paiement de la facture a été effectué.",
+            "Le document certifie-t-il la réception du règlement ?"
+        ]
+    },
+    "amount_paid": {
+        "english": [
+            "What is the amount that was paid?",
+            "How much was paid on this invoice?",
+            "Can you extract the payment amount?",
+            "What is the value of the payment received?",
+            "Find the amount that has been settled or paid.",
+            "If a payment was made, what was the amount?",
+            "Extract the sum that was paid towards the bill.",
+            "What is the total of the payment received?"
+        ],
+        "french": [
+            "Quel est le montant qui a été payé ?",
+            "Combien a été payé sur cette facture ?",
+            "Pouvez-vous extraire le montant du paiement ?",
+            "Quelle est la valeur du règlement reçu ?",
+            "Trouvez le montant qui a été réglé.",
+            "Si un paiement a été effectué, quel en était le montant ?",
+            "Extraire la somme qui a été versée pour cette facture.",
+            "Quel est le total du paiement reçu ?"
+        ]
+    },
+    "mandatory_coverage": {
+        "english": [
+            "What is the mandatory coverage amount?",
+            "How much is covered by the compulsory health insurance ('AMO', 'Rbmt RO', 'CAISSE', 'Noemie', etc.)?",
+            "What is the amount listed next to 'AMO', 'Rbmt RO', 'CAISSE', 'Noemie', etc.?",
+            "Find the reimbursement amount from the 'AMO', 'Rbmt RO', 'CAISSE', 'Noemie', etc. (mandatory part).",
+            "Extract the value for the 'Régime Obligatoire' ('AMO', 'Rbmt RO', 'CAISSE', 'Noemie', etc.) coverage.",
+            "What is the reimbursement amount from the mandatory scheme?",
+            "What value is associated with the Noemie mandatory reimbursement?",
+            "Provide the amount for the compulsory insurance portion."
+        ],
+        "french": [
+            "Quel est le montant de la part obligatoire ?",
+            "À combien s'élève le remboursement du Régime Obligatoire ('AMO', 'Rbmt RO', 'CAISSE', 'Noemie', etc.) ?",
+            "Quelle est la valeur indiquée pour 'AMO', 'Rbmt RO', 'CAISSE', 'Noemie', etc. ?",
+            "Extraire le montant remboursé par la 'AMO', 'Rbmt RO', 'CAISSE', 'Noemie', etc.",
+            "Trouvez le montant de la part remboursée par l'Assurance Maladie Obligatoire.",
+            "Quel est le montant du remboursement 'AMO', 'Rbmt RO', 'CAISSE', 'Noemie', etc. ?",
+            "À combien s'élève la part 'AMO', 'Rbmt RO', 'CAISSE', 'Noemie', etc. ?",
+            "Quel est le montant du remboursement obligatoire transmis via Noemie ?"
+        ]
+    },
+    "complementary_coverage": {
+        "english": [
+            "What is the amount of the complementary coverage, indicated by 'AMC', 'RC', or 'Mutuelle'?",
+            "Extract the reimbursement value from the supplemental insurance, looking for the terms 'AMC', 'RC', or 'Mutuelle'.",
+            "How much is the complementary reimbursement from the 'Mutuelle', which may be labeled 'AMC' or 'RC'?",
+            "Find the value for the complementary part (Mutuelle), which is identified by the labels 'AMC' or 'RC'.",
+            "Provide the amount paid by the complementary insurance ('Mutuelle'), listed under 'AMC' or 'RC'."
+        ],
+        "french": [
+            "Quel est le montant de la part complémentaire, indiqué par les termes 'AMC', 'RC' ou 'Mutuelle' ?",
+            "Extraire la valeur du remboursement de la 'Mutuelle', en recherchant les libellés 'AMC' ou 'RC'.",
+            "À combien s'élève le remboursement de l'assurance complémentaire, identifié par 'AMC', 'RC' ou 'Mutuelle' ?",
+            "Trouvez le montant remboursé par la 'Mutuelle', c'est-à-dire la part 'AMC' ou 'RC'.",
+            "Fournir le montant payé par l'assurance complémentaire ('Mutuelle'), listé sous 'AMC' ou 'RC'."
+        ]
+
+    },
+    "client_part": {
+        "english":[
+            "What is the amount to be paid by the client, indicated by 'ASSURE', 'Part Client', or 'Part Assuré'?",
+            "Extract the value of the client's share, looking for the labels 'Part Client', 'ASSURE', or 'Part Assuré'.",
+            "How much is the patient's remaining portion, which might be listed under 'ASSURE', 'Part Client', or 'Part Assuré'?",
+            "Find the amount for the 'Part Assuré', also referred to as 'Part Client' or 'ASSURE'.",
+            "Provide the out-of-pocket amount for the client, identified by terms like 'ASSURE', 'Part Client', or 'Part Assuré'."
+        ],
+        "french": [
+            "Quel est le montant de la part client, indiqué par 'ASSURE', 'Part Client', ou 'Part Assuré' ?",
+            "Extraire la valeur de la part de l'assuré, en recherchant les termes 'Part Assuré', 'Part Client', ou 'ASSURE'.",
+            "À combien s'élève le montant restant à la charge de l'assuré, indiqué sous les libellés 'ASSURE', 'Part Client', ou 'Part Assuré' ?",
+            "Trouvez le montant de la 'Part Client', aussi connu sous le nom de 'Part Assuré' ou 'ASSURE'.",
+            "Fournir le reste à charge pour le client, identifié par des termes comme 'ASSURE', 'Part Client', ou 'Part Assuré'."
+        ]
+    },
+    "remaining_payment": {
+        "english": [
+            "What is the remaining balance to be paid?",
+            "How much is still owed on this invoice?",
+            "What is the outstanding amount or balance due?",
+            "Extract the remaining payment amount.",
+            "If the invoice is not fully paid, what is the amount left to pay?",
+            "Find the amount yet to be paid by the beneficiary.",
+            "What is the remaining sum to be settled?",
+            "Provide the outstanding balance on the account."
+        ],
+        "french": [
+            "Quel est le reste à payer ?",
+            "Combien reste-t-il à régler sur cette facture ?",
+            "Quel est le solde restant dû ?",
+            "Extraire le montant du paiement restant.",
+            "Si la facture n'est pas entièrement réglée, quel est le montant restant ?",
+            "Trouvez le montant encore dû par le bénéficiaire.",
+            "À combien s'élève le montant restant à payer ?",
+            "Indiquer le solde impayé de la facture."
+        ]
+    },
+    "insured_name": {
+        "english": [
+            "What is the full name of the insured person?",
+            "Who is the insured individual named on the document?",
+            "Can you extract the insured's full name?",
+            "Find the name of the person who is insured.",
+            "What is the name of the 'Assuré' (the insured person)?",
+            "Provide the name of the policyholder or insured."
+        ],
+        "french": [
+            "Quel est le nom complet de l'assuré ?",
+            "Qui est la personne assurée mentionnée dans le document ?",
+            "Extraire le nom et prénom de l'assuré.",
+            "Trouvez le nom de la personne couverte par l'assurance.",
+            "Quel est le nom indiqué pour l'assuré principal ?",
+            "À quel nom la police d'assurance est-elle établie ?"
+        ]
+    },
+    "insured_dob": {
+        "english": [
+            "What is the insured person's date of birth in dd-mm-yyyy format?",
+            "Extract the insured's date of birth, ensuring it is in the dd-mm-yyyy format.",
+            "Find the date of birth for the insured and provide it as dd-mm-yyyy.",
+            "What is the insured's DOB, formatted as dd-mm-yyyy?",
+            "Provide the insured's date of birth using the day-month-year (dd-mm-yyyy) format."
+            ],
+        "french": [
+            "Quelle est la date de naissance de l'assuré au format dd-mm-yyyy ?",
+            "Extraire la date de naissance de l'assuré, en respectant le format dd-mm-yyyy.",
+            "Trouvez la date de naissance de la personne assurée et donnez-la au format dd-mm-yyyy.",
+            "Quelle est la date de naissance de l'assuré, formatée en dd-mm-yyyy ?",
+            "Fournir la date de naissance de l'assuré en utilisant le format jour-mois-année (dd-mm-yyyy)."
+        ]
+    },
+    "beneficiary_name": {
+        "english": [
+            "What is the full name of the beneficiary?",
+            "Who is the beneficiary named on the invoice?",
+            "Can you extract the beneficiary's full name?",
+            "What is the name of the patient or person who received the service?",
+            "Find the name of the service recipient.",
+            "For whom is this invoice intended? Provide the full name."
+        ],
+        "french": [
+            "Quel est le nom complet du bénéficiaire ?",
+            "Qui est le bénéficiaire de la facture ?",
+            "Extraire le nom et le prénom du bénéficiaire.",
+            "Quel est le nom du patient ou du bénéficiaire des soins ?",
+            "Trouvez le nom de la personne qui a reçu les services facturés.",
+            "À quel nom les services ou produits ont-ils été facturés ?"
+        ]
+    },
+    "beneficiary_dob": {
+        "english": [
+            "What is the beneficiary's date of birth in dd-mm-yyyy format?",
+            "Extract the date of birth for the beneficiary, ensuring it is formatted as dd-mm-yyyy.",
+            "Find the patient or beneficiary's date of birth and provide it as dd-mm-yyyy.",
+            "What is the beneficiary's DOB, formatted as dd-mm-yyyy?",
+            "What is the birth date of the person who received the care, in dd-mm-yyyy format?"
+        ],
+        "french": [
+            "Quelle est la date de naissance du bénéficiaire au format dd-mm-yyyy ?",
+            "Extraire la date de naissance du bénéficiaire, en respectant le format dd-mm-yyyy.",
+            "Trouvez la date de naissance du patient ou bénéficiaire et donnez-la au format dd-mm-yyyy.",
+            "Quelle est la date de naissance du bénéficiaire, formatée comme suit : dd-mm-yyyy ?",
+            "Fournir la date de naissance de la personne ayant reçu les soins, en utilisant le format jour-mois-année (dd-mm-yyyy)."
+        ]
+    },
+    "invoice_date": {
+        "english": [
+            "What is the invoice date, in dd-mm-yyyy format?",
+            "Extract the issue date of the invoice, formatted as dd-mm-yyyy.",
+            "What is the document's date, provided as dd-mm-yyyy?",
+            "Find the date the invoice was created, in dd-mm-yyyy format.",
+            "Provide the invoice date using the dd-mm-yyyy format."
+        ],
+        "french": [
+            "Quelle est la date de la facture, au format dd-mm-yyyy ?",
+            "Extraire la date d'émission de la facture, formatée en dd-mm-yyyy.",
+            "Quelle est la date du document, fournie en dd-mm-yyyy ?",
+            "Trouvez la date de création de la facture, au format dd-mm-yyyy.",
+            "Quelle est la date indiquée comme 'Fait le', au format dd-mm-yyyy ?"
+        ]
+    },
+    "security_number": {
+        "english": [
+            "What is the Social Security number, which may be labeled 'N° SS', 'N° INSEE', or 'Sécurité Sociale'?",
+            "Extract the 13 or 15 digit Social Security number (N° SS, 'N° INSEE', or 'Sécurité Sociale').",
+            "Find the identifier for 'Sécurité Sociale', which should be a 13 or 15 digit number.",
+            "What is the INSEE number ('N° INSEE') or Social Security Number ('N° SS') on the document?",
+            "Provide the 'N° SS' (Social Security number)."
+        ],
+        "french": [
+            "Quel est le numéro de Sécurité Sociale (N° SS, 'N° INSEE', ou 'Sécurité Sociale') ?",
+            "Extraire le numéro INSEE à 13 ou 15 chiffres.",
+            "Trouvez le numéro de Sécurité Sociale, qui peut être indiqué comme 'N° SS' ou 'N° INSEE'.",
+            "Quel est le numéro d'immatriculation ou 'N° SS' présent sur le document ?",
+            "Fournir le numéro de Sécurité Sociale (13 ou 15 chiffres)."
+        ]
+    },
+    "invoice_issuer": {
+        "english": [
+            "Who issued the invoice?",
+            "What is the name of the service provider or organization?",
+            "Can you extract the name of the vendor or supplier?",
+            "From which company or individual did this invoice come?",
+            "Find the name of the invoice issuer."
+        ],
+        "french": [
+            "Qui est l'émetteur de la facture ?",
+            "Quel est le nom du fournisseur ou du prestataire de services ?",
+            "Pouvez-vous extraire le nom du vendeur ?",
+            "De quelle entreprise ou personne cette facture provient-elle ?",
+            "Trouver le nom de la société ou du professionnel qui a émis ce document."
+        ]
+    },
+    "currency": {
+        "english": [
+            "What is the currency used in the document (e.g., EUR, USD)?",
+            "Can you extract the currency symbol, like € or $?",
+            "In what currency are the invoice amounts listed?",
+            "Find the currency code or symbol.",
+            "Identify the monetary unit for the amounts shown."
+        ],
+        "french": [
+            "Quelle est la devise utilisée dans le document (par exemple, EUR, USD) ?",
+            "Pouvez-vous extraire le symbole de la devise, comme € ou $ ?",
+            "En quelle devise les montants de la facture sont-ils indiqués ?",
+            "Trouver le code ou le symbole de la devise.",
+            "Identifier l'unité monétaire des montants affichés."
+        ]
+    },
+    "items": {
+        "english": [
+            "List the descriptions of all services, sessions on the invoice.",
+            "What are the names of the services, sessions or products billed?",
+            "What services, sessions or products are billed in this document?",
+            "Identify each service or product in the invoice along with its quantity, date, insurance coverage, and total cost.",
+            "What services are detailed in the invoice?",
+            "Can you detail the billed services, their descriptions, dates, and amounts?",
+            "Give me all services or sessions billed, with their mandatory insurance coverage."
+        ],
+        "french": [
+            "Listez les descriptions de tous les services ou séances figurant sur la facture.",
+            "Quels sont les noms des services, séances ou produits facturés ?",
+            "Quels services, séances ou produits sont facturés dans ce document ?",
+            "Identifiez chaque service ou produit dans la facture ainsi que sa quantité, sa date, sa prise en charge par l'assurance et son coût total.",
+            "Quels services sont détaillés dans la facture ?",
+            "Pouvez-vous détailler les services facturés, leurs descriptions, leurs dates et leurs montants ?"
+        ]
+    }
+}
+
--- a/easydistill/mmkd/infer.log
+++ b/easydistill/mmkd/infer.log
@@ -0,0 +1,174 @@
+INFO 08-03 20:27:56 [importing.py:53] Triton module has been replaced with a placeholder.
+INFO 08-03 20:27:56 [__init__.py:239] Automatically detected platform cuda.
+2025-08-03 20:27:58,078 - INFO - Generating distillation data from the teacher model!
+2025-08-03 20:27:58,384 - INFO - Loading processor & vLLM model from Qwen/Qwen2.5-VL-32B-Instruct
+Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
+2025-08-03 20:28:00,580 - INFO - Initial eos_token_id 151645 from tokenizer
+2025-08-03 20:28:00,580 - INFO - processor.tokenizer eos_token: <|im_end|>, eos_token_id: 151645
+INFO 08-03 20:28:09 [config.py:717] This model supports multiple tasks: {'reward', 'classify', 'score', 'generate', 'embed'}. Defaulting to 'generate'.
+INFO 08-03 20:28:09 [config.py:2003] Chunked prefill is enabled with max_num_batched_tokens=16384.
+INFO 08-03 20:28:11 [core.py:58] Initializing a V1 LLM engine (v0.8.5) with config: model='Qwen/Qwen2.5-VL-32B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-VL-32B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=16000, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=Qwen/Qwen2.5-VL-32B-Instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"level":3,"custom_ops":["none"],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output"],"use_inductor":true,"compile_sizes":[],"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":512}
+WARNING 08-03 20:28:12 [utils.py:2522] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x72908ff5c0d0>
+INFO 08-03 20:28:13 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
+INFO 08-03 20:28:13 [cuda.py:221] Using Flash Attention backend on V1 engine.
+WARNING 08-03 20:28:20 [topk_topp_sampler.py:69] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
+INFO 08-03 20:28:20 [gpu_model_runner.py:1329] Starting to load model Qwen/Qwen2.5-VL-32B-Instruct...
+WARNING 08-03 20:28:20 [vision.py:93] Current `vllm-flash-attn` has a bug inside vision module, so we use xformers backend instead. You can run `pip install flash-attn` to use flash-attention backend.
+INFO 08-03 20:28:20 [config.py:3614] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512] is overridden by config [512, 384, 256, 128, 4, 2, 1, 392, 264, 136, 8, 400, 272, 144, 16, 408, 280, 152, 24, 416, 288, 160, 32, 424, 296, 168, 40, 432, 304, 176, 48, 440, 312, 184, 56, 448, 320, 192, 64, 456, 328, 200, 72, 464, 336, 208, 80, 472, 344, 216, 88, 120, 480, 352, 248, 224, 96, 488, 504, 360, 232, 104, 496, 368, 240, 112, 376]
+INFO 08-03 20:28:21 [weight_utils.py:265] Using model weights format ['*.safetensors']
+
+Loading safetensors checkpoint shards:   0% Completed | 0/18 [00:00<?, ?it/s]
+
+Loading safetensors checkpoint shards:   6% Completed | 1/18 [00:01<00:18,  1.10s/it]
+
+Loading safetensors checkpoint shards:  11% Completed | 2/18 [00:01<00:12,  1.23it/s]
+
+Loading safetensors checkpoint shards:  17% Completed | 3/18 [00:02<00:14,  1.02it/s]
+
+Loading safetensors checkpoint shards:  22% Completed | 4/18 [00:04<00:14,  1.07s/it]
+
+Loading safetensors checkpoint shards:  28% Completed | 5/18 [00:05<00:14,  1.12s/it]
+
+Loading safetensors checkpoint shards:  33% Completed | 6/18 [00:06<00:13,  1.15s/it]
+
+Loading safetensors checkpoint shards:  39% Completed | 7/18 [00:07<00:12,  1.17s/it]
+
+Loading safetensors checkpoint shards:  44% Completed | 8/18 [00:08<00:11,  1.17s/it]
+
+Loading safetensors checkpoint shards:  50% Completed | 9/18 [00:10<00:10,  1.18s/it]
+
+Loading safetensors checkpoint shards:  56% Completed | 10/18 [00:11<00:09,  1.19s/it]
+
+Loading safetensors checkpoint shards:  61% Completed | 11/18 [00:12<00:08,  1.19s/it]
+
+Loading safetensors checkpoint shards:  67% Completed | 12/18 [00:13<00:07,  1.20s/it]
+
+Loading safetensors checkpoint shards:  72% Completed | 13/18 [00:15<00:06,  1.23s/it]
+
+Loading safetensors checkpoint shards:  78% Completed | 14/18 [00:16<00:04,  1.25s/it]
+
+Loading safetensors checkpoint shards:  83% Completed | 15/18 [00:17<00:03,  1.26s/it]
+
+Loading safetensors checkpoint shards:  89% Completed | 16/18 [00:18<00:02,  1.26s/it]
+
+Loading safetensors checkpoint shards:  94% Completed | 17/18 [00:19<00:01,  1.17s/it]
+
+Loading safetensors checkpoint shards: 100% Completed | 18/18 [00:21<00:00,  1.18s/it]
+
+Loading safetensors checkpoint shards: 100% Completed | 18/18 [00:21<00:00,  1.17s/it]
+
+INFO 08-03 20:28:42 [loader.py:458] Loading weights took 21.13 seconds
+INFO 08-03 20:28:42 [gpu_model_runner.py:1347] Model loading took 62.4365 GiB and 21.912121 seconds
+INFO 08-03 20:28:46 [gpu_model_runner.py:1620] Encoder cache will be initialized with a budget of 16384 tokens, and profiled with 1 image items of the maximum feature size.
+INFO 08-03 20:29:09 [backends.py:420] Using cache directory: /home/nguyendc/.cache/vllm/torch_compile_cache/1fe259ecb1/rank_0_0 for vLLM's torch.compile
+INFO 08-03 20:29:09 [backends.py:430] Dynamo bytecode transform time: 19.39 s
+INFO 08-03 20:29:22 [backends.py:118] Directly load the compiled graph(s) for shape None from the cache, took 11.165 s
+INFO 08-03 20:29:24 [monitor.py:33] torch.compile takes 19.39 s in total
+INFO 08-03 20:29:29 [kv_cache_utils.py:634] GPU KV cache size: 38,016 tokens
+INFO 08-03 20:29:29 [kv_cache_utils.py:637] Maximum concurrency for 16,000 tokens per request: 2.38x
+INFO 08-03 20:30:08 [gpu_model_runner.py:1686] Graph capturing finished in 39 secs, took 0.96 GiB
+INFO 08-03 20:30:08 [core.py:159] init engine (profile, create kv cache, warmup model) took 86.30 seconds
+INFO 08-03 20:30:12 [core_client.py:439] Core engine process 0 ready.
+2025-08-03 20:30:12,647 - INFO - Qwen2.5-VL vLLM model loaded successfully
+
+Generating responses:   0%|          | 0/40 [00:00<?, ?it/s]
+
+Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
+
+Processed prompts: 100%|██████████| 1/1 [00:12<00:00, 12.96s/it, est. speed input: 272.51 toks/s, output: 20.14 toks/s][A
+Processed prompts: 100%|██████████| 1/1 [00:12<00:00, 12.96s/it, est. speed input: 272.51 toks/s, output: 20.14 toks/s]
+
+Generating responses:   2%|▎         | 1/40 [00:16<10:29, 16.13s/it]
+
+Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
+
+Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.37s/it, est. speed input: 333.81 toks/s, output: 20.48 toks/s][A
+Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.37s/it, est. speed input: 333.81 toks/s, output: 20.48 toks/s]
+
+Generating responses:   5%|▌         | 2/40 [00:23<06:58, 11.01s/it]
+
+Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
+
+Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.54s/it, est. speed input: 364.77 toks/s, output: 20.02 toks/s][A
+Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.54s/it, est. speed input: 364.77 toks/s, output: 20.02 toks/s]
+
+Generating responses:   8%|▊         | 3/40 [00:31<05:50,  9.47s/it]
+
+Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
+
+Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.43s/it, est. speed input: 343.57 toks/s, output: 20.31 toks/s][A
+Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.43s/it, est. speed input: 343.57 toks/s, output: 20.31 toks/s]
+
+Generating responses:  10%|█         | 4/40 [00:38<05:12,  8.69s/it]
+
+Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
+
+Processed prompts: 100%|██████████| 1/1 [00:08<00:00,  8.27s/it, est. speed input: 564.53 toks/s, output: 18.27 toks/s][A
+Processed prompts: 100%|██████████| 1/1 [00:08<00:00,  8.27s/it, est. speed input: 564.53 toks/s, output: 18.27 toks/s]
+
+Generating responses:  12%|█▎        | 5/40 [00:47<05:02,  8.64s/it]
+
+Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
+
+Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.35s/it, est. speed input: 307.93 toks/s, output: 20.56 toks/s][A
+Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.35s/it, est. speed input: 307.93 toks/s, output: 20.56 toks/s]
+
+Generating responses:  15%|█▌        | 6/40 [00:54<04:39,  8.21s/it]
+
+Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
+
+Processed prompts: 100%|██████████| 1/1 [00:08<00:00,  8.26s/it, est. speed input: 565.20 toks/s, output: 18.29 toks/s][A
+Processed prompts: 100%|██████████| 1/1 [00:08<00:00,  8.26s/it, est. speed input: 565.20 toks/s, output: 18.29 toks/s]
+
+Generating responses:  18%|█▊        | 7/40 [01:03<04:34,  8.32s/it]
+
+Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
+
+Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.53s/it, est. speed input: 363.87 toks/s, output: 20.05 toks/s][A
+Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.53s/it, est. speed input: 363.87 toks/s, output: 20.05 toks/s]
+
+Generating responses:  20%|██        | 8/40 [01:10<04:19,  8.10s/it]
+
+Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
+
+Processed prompts: 100%|██████████| 1/1 [00:08<00:00,  8.25s/it, est. speed input: 565.62 toks/s, output: 18.30 toks/s][A
+Processed prompts: 100%|██████████| 1/1 [00:08<00:00,  8.25s/it, est. speed input: 565.62 toks/s, output: 18.30 toks/s]
+
+Generating responses:  22%|██▎       | 9/40 [01:19<04:15,  8.24s/it]
+
+Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
+
+Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.63s/it, est. speed input: 395.25 toks/s, output: 19.80 toks/s][A
+Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.63s/it, est. speed input: 395.25 toks/s, output: 19.80 toks/s]
+
+Generating responses:  25%|██▌       | 10/40 [01:27<04:02,  8.08s/it]
+
+Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
+
+Processed prompts: 100%|██████████| 1/1 [00:17<00:00, 17.76s/it, est. speed input: 293.12 toks/s, output: 20.05 toks/s][A
+Processed prompts: 100%|██████████| 1/1 [00:17<00:00, 17.76s/it, est. speed input: 293.12 toks/s, output: 20.05 toks/s]
+
+Generating responses:  28%|██▊       | 11/40 [01:45<05:22, 11.13s/it]
+
+Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
+
+Processed prompts: 100%|██████████| 1/1 [00:12<00:00, 12.45s/it, est. speed input: 276.12 toks/s, output: 20.48 toks/s][A
+Processed prompts: 100%|██████████| 1/1 [00:12<00:00, 12.45s/it, est. speed input: 276.12 toks/s, output: 20.48 toks/s]
+
+Generating responses:  30%|███       | 12/40 [01:57<05:24, 11.57s/it]
+
+Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
+
+Processed prompts: 100%|██████████| 1/1 [00:15<00:00, 15.51s/it, est. speed input: 226.26 toks/s, output: 20.76 toks/s][A
+Processed prompts: 100%|██████████| 1/1 [00:15<00:00, 15.51s/it, est. speed input: 226.26 toks/s, output: 20.76 toks/s]
+
+Generating responses:  32%|███▎      | 13/40 [02:13<05:45, 12.81s/it]
+
+Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
+
+Processed prompts: 100%|██████████| 1/1 [00:12<00:00, 12.27s/it, est. speed input: 278.40 toks/s, output: 20.45 toks/s][A
+Processed prompts: 100%|██████████| 1/1 [00:12<00:00, 12.27s/it, est. speed input: 278.40 toks/s, output: 20.45 toks/s]
+
+Generating responses:  35%|███▌      | 14/40 [02:25<05:29, 12.69s/it]
+
+Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
--- a/easydistill/mmkd/infer.py
+++ b/easydistill/mmkd/infer.py
@@ -182,8 +182,11 @@ def generate_teacher_logits_batch(processor, llm, data_list, config, batch_size=
                "multi_modal_data": mm_data,
            }
            new_batch.append(sample_inputs)
-        outputs = llm.generate(new_batch, sampling_params=sampling_params)
-        logits+=[output.outputs[0].logprobs for output in outputs]
+        try:
+            outputs = llm.generate(new_batch, sampling_params=sampling_params)
+            logits+=[output.outputs[0].logprobs for output in outputs]
+        except:
+            continue

        for b in range(len(batch_outcomes)):
       
@@ -273,7 +276,7 @@ def infer_with_teacher_model(config):
        elif job_type == "mmkd_white_box":
            
            tokenizer, llm = load_tokenizer_and_vllm(config)
-            generate_teacher_logits_batch(tokenizer, llm, data_list, config)
+            generate_teacher_logits_batch(tokenizer, llm, data_list, config, 1)
        else:
            logging.error(f"Invalid job type: {job_type}")
            raise ValueError(f"Invalid job type: {job_type}")
--- a/easydistill/mmkd/logits.json
+++ b/easydistill/mmkd/logits.json
--- a/easydistill/mmkd/mllm_demo_distill.json
+++ b/easydistill/mmkd/mllm_demo_distill.json
--- a/easydistill/mmkd/prompt_templates.json
+++ b/easydistill/mmkd/prompt_templates.json
@@ -0,0 +1,890 @@
+{
+    "templates": [
+        {
+            "prompts": {
+                "en": [
+                    "Get doctor information",
+                    "Who is the doctor?",
+                    "Provide doctor details"
+                ],
+                "fr": [
+                    "Obtenir les informations du médecin",
+                    "Qui est le médecin ?",
+                    "Fournir les détails du médecin"
+                ]
+            },
+            "group_name": "doctor_info",
+            "target_keys": [
+                "doctor_name",
+                "profession",
+                "adeli_number",
+                "invoice_issuer"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Who is the patient?",
+                    "Get patient info",
+                    "Show insured person's details"
+                ],
+                "fr": [
+                    "Qui est le patient ?",
+                    "Obtenir les infos du patient",
+                    "Afficher les détails de l'assuré"
+                ]
+            },
+            "group_name": "patient_info",
+            "target_keys": [
+                "insured_name",
+                "beneficiary_name",
+                "security_number"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Give me the billing summary",
+                    "Show the bill summary",
+                    "Billing overview"
+                ],
+                "fr": [
+                    "Donnez-moi le résumé de la facturation",
+                    "Afficher le résumé de la facture",
+                    "Aperçu de la facturation"
+                ]
+            },
+            "group_name": "billing_summary",
+            "target_keys": [
+                "invoice_date",
+                "total_billed",
+                "amount_paid",
+                "bill_paid",
+                "currency"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "What services were provided?",
+                    "List the services",
+                    "Show service details"
+                ],
+                "fr": [
+                    "Quels services ont été fournis ?",
+                    "Lister les services",
+                    "Afficher le détail des services"
+                ]
+            },
+            "group_name": "service_details",
+            "target_keys": [
+                "items"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "What is the payment status?",
+                    "Was the bill paid?",
+                    "Show payment details"
+                ],
+                "fr": [
+                    "Quel est le statut du paiement ?",
+                    "La facture a-t-elle été payée ?",
+                    "Afficher les détails du paiement"
+                ]
+            },
+            "group_name": "payment_status",
+            "target_keys": [
+                "bill_paid",
+                "amount_paid",
+                "total_billed",
+                "remaining_payment"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "List the doctor's professional numbers",
+                    "Get professional IDs"
+                ],
+                "fr": [
+                    "Lister les numéros professionnels du médecin",
+                    "Obtenir les identifiants professionnels"
+                ]
+            },
+            "group_name": "professional_ids",
+            "target_keys": [
+                "adeli_number",
+                "rpps_number",
+                "finess_number"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "What was the period of care?",
+                    "Show care dates"
+                ],
+                "fr": [
+                    "Quelle était la période de soins ?",
+                    "Afficher les dates de soins"
+                ]
+            },
+            "group_name": "care_period",
+            "target_keys": [
+                "care_start_date",
+                "care_end_date",
+                "items"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Show coverage and out-of-pocket costs",
+                    "Financial coverage details"
+                ],
+                "fr": [
+                    "Afficher la couverture et le reste à charge",
+                    "Détails de la couverture financière"
+                ]
+            },
+            "group_name": "financial_coverage",
+            "target_keys": [
+                "mandatory_coverage",
+                "out_of_pocket"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Show all details from the invoice",
+                    "Full invoice data"
+                ],
+                "fr": [
+                    "Afficher tous les détails de la facture",
+                    "Données complètes de la facture"
+                ]
+            },
+            "group_name": "invoice_details",
+            "target_keys": [
+                "invoice_date",
+                "invoice_issuer",
+                "total_billed",
+                "items"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Who was the beneficiary of the service?",
+                    "Get beneficiary info"
+                ],
+                "fr": [
+                    "Qui était le bénéficiaire du service ?",
+                    "Obtenir les informations du bénéficiaire"
+                ]
+            },
+            "group_name": "beneficiary_info",
+            "target_keys": [
+                "beneficiary_name",
+                "beneficiary_dob"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Get all financial fields",
+                    "Show all money fields"
+                ],
+                "fr": [
+                    "Obtenir tous les champs financiers",
+                    "Afficher tous les champs monétaires"
+                ]
+            },
+            "group_name": "full_financials",
+            "target_keys": [
+                "total_billed",
+                "amount_paid",
+                "mandatory_coverage",
+                "out_of_pocket",
+                "remaining_payment",
+                "currency"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Who is the provider?",
+                    "Who provided the service?"
+                ],
+                "fr": [
+                    "Qui est le prestataire ?",
+                    "Qui a fourni le service ?"
+                ]
+            },
+            "group_name": "provider_identity",
+            "target_keys": [
+                "doctor_name",
+                "profession"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Show patient identification details",
+                    "Patient ID info"
+                ],
+                "fr": [
+                    "Afficher les détails d'identification du patient",
+                    "Infos d'identification du patient"
+                ]
+            },
+            "group_name": "patient_identity",
+            "target_keys": [
+                "insured_name",
+                "security_number",
+                "insured_dob"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "How much did the service item cost?",
+                    "Item cost"
+                ],
+                "fr": [
+                    "Combien a coûté l'acte ?",
+                    "Coût de l'acte"
+                ]
+            },
+            "group_name": "service_item_cost",
+            "target_keys": [
+                "items.amount",
+                "items.description"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "When was the specific service rendered?",
+                    "Date of service item"
+                ],
+                "fr": [
+                    "Quand le service spécifique a-t-il été rendu ?",
+                    "Date de l'acte"
+                ]
+            },
+            "group_name": "service_item_date",
+            "target_keys": [
+                "items.date_of_service",
+                "items.description"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Who created the bill?",
+                    "Who is the issuer?"
+                ],
+                "fr": [
+                    "Qui a créé la facture ?",
+                    "Qui est l'émetteur ?"
+                ]
+            },
+            "group_name": "invoice_issuer_details",
+            "target_keys": [
+                "invoice_issuer",
+                "invoice_date"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Is this a bill?",
+                    "What is the document type?"
+                ],
+                "fr": [
+                    "Est-ce une facture ?",
+                    "Quel est le type de document ?"
+                ]
+            },
+            "group_name": "document_type",
+            "target_keys": [
+                "is_bill"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Show all money-related fields",
+                    "List all monetary values"
+                ],
+                "fr": [
+                    "Afficher tous les champs liés à l'argent",
+                    "Lister toutes les valeurs monétaires"
+                ]
+            },
+            "group_name": "monetary_details",
+            "target_keys": [
+                "total_billed",
+                "amount_paid",
+                "out_of_pocket",
+                "remaining_payment",
+                "currency"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "List all dates on the document",
+                    "Show all dates"
+                ],
+                "fr": [
+                    "Lister toutes les dates sur le document",
+                    "Afficher toutes les dates"
+                ]
+            },
+            "group_name": "all_dates",
+            "target_keys": [
+                "care_start_date",
+                "care_end_date",
+                "invoice_date",
+                "insured_dob",
+                "beneficiary_dob"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "List all people mentioned",
+                    "Show all names"
+                ],
+                "fr": [
+                    "Lister toutes les personnes mentionnées",
+                    "Afficher tous les noms"
+                ]
+            },
+            "group_name": "all_names",
+            "target_keys": [
+                "doctor_name",
+                "insured_name",
+                "beneficiary_name",
+                "invoice_issuer"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Get only the insured person's info",
+                    "Insured person details"
+                ],
+                "fr": [
+                    "Obtenir uniquement les informations de l'assuré",
+                    "Détails de la personne assurée"
+                ]
+            },
+            "group_name": "insured_only",
+            "target_keys": [
+                "insured_name",
+                "insured_dob",
+                "security_number"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "What is the ADELI number?",
+                    "Show me the ADELI"
+                ],
+                "fr": [
+                    "Quel est le numéro ADELI ?",
+                    "Montrez-moi le numéro ADELI"
+                ]
+            },
+            "group_name": "adeli_number",
+            "target_keys": [
+                "adeli_number"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "What is the currency used?",
+                    "Show the currency"
+                ],
+                "fr": [
+                    "Quelle est la devise utilisée ?",
+                    "Afficher la devise"
+                ]
+            },
+            "group_name": "currency_info",
+            "target_keys": [
+                "currency"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Show service quantity and description",
+                    "What was the quantity of services?"
+                ],
+                "fr": [
+                    "Afficher la quantité et la description du service",
+                    "Quelle était la quantité de services ?"
+                ]
+            },
+            "group_name": "service_quantity",
+            "target_keys": [
+                "items.quantity",
+                "items.description"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Financial reconciliation",
+                    "Show billed vs paid"
+                ],
+                "fr": [
+                    "Rapprochement financier",
+                    "Afficher le facturé par rapport au payé"
+                ]
+            },
+            "group_name": "financial_reconciliation",
+            "target_keys": [
+                "total_billed",
+                "amount_paid",
+                "remaining_payment"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Is the doctor the same as the issuer?",
+                    "Compare doctor and issuer"
+                ],
+                "fr": [
+                    "Le médecin est-il le même que l'émetteur ?",
+                    "Comparer le médecin et l'émetteur"
+                ]
+            },
+            "group_name": "doctor_vs_issuer",
+            "target_keys": [
+                "doctor_name",
+                "invoice_issuer"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Are there any missing professional numbers?",
+                    "Check for null IDs"
+                ],
+                "fr": [
+                    "Y a-t-il des numéros professionnels manquants ?",
+                    "Vérifier les identifiants nuls"
+                ]
+            },
+            "group_name": "missing_ids",
+            "target_keys": [
+                "adeli_number",
+                "rpps_number",
+                "finess_number"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Show patient and service date",
+                    "Who received care and when?"
+                ],
+                "fr": [
+                    "Afficher le patient et la date du service",
+                    "Qui a reçu les soins et quand ?"
+                ]
+            },
+            "group_name": "patient_and_service_date",
+            "target_keys": [
+                "beneficiary_name",
+                "items.date_of_service"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Give me the total amount due",
+                    "What was the total cost?"
+                ],
+                "fr": [
+                    "Donnez-moi le montant total dû",
+                    "Quel était le coût total ?"
+                ]
+            },
+            "group_name": "total_cost",
+            "target_keys": [
+                "total_billed"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Provide the social security number",
+                    "What is the security number?"
+                ],
+                "fr": [
+                    "Fournir le numéro de sécurité sociale",
+                    "Quel est le numéro de sécurité sociale ?"
+                ]
+            },
+            "group_name": "security_number_info",
+            "target_keys": [
+                "security_number"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Dump all data",
+                    "Show me everything"
+                ],
+                "fr": [
+                    "Extraire toutes les données",
+                    "Montre-moi tout"
+                ]
+            },
+            "group_name": "full_dump",
+            "target_keys": [
+                "is_bill",
+                "profession",
+                "adeli_number",
+                "rpps_number",
+                "finess_number",
+                "doctor_name",
+                "total_billed",
+                "bill_paid",
+                "amount_paid",
+                "mandatory_coverage",
+                "out_of_pocket",
+                "remaining_payment",
+                "insured_name",
+                "insured_dob",
+                "beneficiary_name",
+                "beneficiary_dob",
+                "care_start_date",
+                "care_end_date",
+                "invoice_date",
+                "security_number",
+                "invoice_issuer",
+                "currency",
+                "items"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "What is the doctor's specialty?",
+                    "Doctor's profession"
+                ],
+                "fr": [
+                    "Quelle est la spécialité du médecin ?",
+                    "Profession du médecin"
+                ]
+            },
+            "group_name": "doctor_profession",
+            "target_keys": [
+                "profession"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Show invoice date and due amount",
+                    "When was the bill issued and for how much?"
+                ],
+                "fr": [
+                    "Afficher la date de la facture et le montant dû",
+                    "Quand la facture a-t-elle été émise et pour quel montant ?"
+                ]
+            },
+            "group_name": "invoice_date_and_amount",
+            "target_keys": [
+                "invoice_date",
+                "total_billed"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Are there any remaining payments?",
+                    "Is there a balance due?"
+                ],
+                "fr": [
+                    "Y a-t-il des paiements restants ?",
+                    "Y a-t-il un solde dû ?"
+                ]
+            },
+            "group_name": "remaining_balance_check",
+            "target_keys": [
+                "remaining_payment",
+                "bill_paid"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Show all patient-related dates",
+                    "What are the patient's dates?"
+                ],
+                "fr": [
+                    "Afficher toutes les dates relatives au patient",
+                    "Quelles sont les dates du patient ?"
+                ]
+            },
+            "group_name": "patient_dates",
+            "target_keys": [
+                "insured_dob",
+                "beneficiary_dob"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "List service, date, and amount",
+                    "Give me a line item breakdown"
+                ],
+                "fr": [
+                    "Lister le service, la date et le montant",
+                    "Donnez-moi une ventilation par poste"
+                ]
+            },
+            "group_name": "line_item_summary",
+            "target_keys": [
+                "items.description",
+                "items.date_of_service",
+                "items.amount"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Who is the insured person?",
+                    "Get insured party details"
+                ],
+                "fr": [
+                    "Qui est l'assuré ?",
+                    "Obtenir les détails de la partie assurée"
+                ]
+            },
+            "group_name": "insured_person_details",
+            "target_keys": [
+                "insured_name",
+                "insured_dob"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "What was the amount paid?",
+                    "How much has been paid?"
+                ],
+                "fr": [
+                    "Quel était le montant payé ?",
+                    "Combien a été payé ?"
+                ]
+            },
+            "group_name": "amount_paid_info",
+            "target_keys": [
+                "amount_paid"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Check for RPPS number",
+                    "Is there an RPPS number?"
+                ],
+                "fr": [
+                    "Vérifier le numéro RPPS",
+                    "Y a-t-il un numéro RPPS ?"
+                ]
+            },
+            "group_name": "rpps_check",
+            "target_keys": [
+                "rpps_number"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Check for FINESS number",
+                    "Is there a FINESS number?"
+                ],
+                "fr": [
+                    "Vérifier le numéro FINESS",
+                    "Y a-t-il un numéro FINESS ?"
+                ]
+            },
+            "group_name": "finess_check",
+            "target_keys": [
+                "finess_number"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Show all billing identifiers",
+                    "List all bill IDs"
+                ],
+                "fr": [
+                    "Afficher tous les identifiants de facturation",
+                    "Lister tous les ID de facture"
+                ]
+            },
+            "group_name": "billing_identifiers",
+            "target_keys": [
+                "invoice_date",
+                "security_number",
+                "adeli_number"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Was there mandatory coverage applied?",
+                    "Check mandatory coverage"
+                ],
+                "fr": [
+                    "Une couverture obligatoire a-t-elle été appliquée ?",
+                    "Vérifier la couverture obligatoire"
+                ]
+            },
+            "group_name": "mandatory_coverage_check",
+            "target_keys": [
+                "mandatory_coverage"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "How much was out of pocket?",
+                    "Check out-of-pocket expense"
+                ],
+                "fr": [
+                    "Combien était le reste à charge ?",
+                    "Vérifier le reste à charge"
+                ]
+            },
+            "group_name": "out_of_pocket_check",
+            "target_keys": [
+                "out_of_pocket"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Show patient name and doctor name",
+                    "Who saw whom?"
+                ],
+                "fr": [
+                    "Afficher le nom du patient et le nom du médecin",
+                    "Qui a vu qui ?"
+                ]
+            },
+            "group_name": "patient_doctor_pair",
+            "target_keys": [
+                "beneficiary_name",
+                "doctor_name"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Get service item coverage",
+                    "Was the line item covered?"
+                ],
+                "fr": [
+                    "Obtenir la couverture de l'acte",
+                    "L'acte était-il couvert ?"
+                ]
+            },
+            "group_name": "service_item_coverage",
+            "target_keys": [
+                "items.mandatory_coverage",
+                "items.description"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Show full patient and beneficiary info",
+                    "Compare insured vs beneficiary"
+                ],
+                "fr": [
+                    "Afficher les informations complètes du patient et du bénéficiaire",
+                    "Comparer l'assuré et le bénéficiaire"
+                ]
+            },
+            "group_name": "insured_vs_beneficiary",
+            "target_keys": [
+                "insured_name",
+                "insured_dob",
+                "beneficiary_name",
+                "beneficiary_dob"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "What is the invoice date?",
+                    "When was the bill created?"
+                ],
+                "fr": [
+                    "Quelle est la date de la facture ?",
+                    "Quand la facture a-t-elle été créée ?"
+                ]
+            },
+            "group_name": "invoice_date_info",
+            "target_keys": [
+                "invoice_date"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Get all provider information",
+                    "Show all details for the doctor"
+                ],
+                "fr": [
+                    "Obtenir toutes les informations sur le prestataire",
+                    "Afficher tous les détails pour le médecin"
+                ]
+            },
+            "group_name": "full_provider_info",
+            "target_keys": [
+                "doctor_name",
+                "profession",
+                "adeli_number",
+                "rpps_number",
+                "finess_number",
+                "invoice_issuer"
+            ]
+        },
+        {
+            "prompts": {
+                "en": [
+                    "Show me the service description only",
+                    "What was the service?"
+                ],
+                "fr": [
+                    "Montrez-moi uniquement la description du service",
+                    "Quel était le service ?"
+                ]
+            },
+            "group_name": "service_description_only",
+            "target_keys": [
+                "items.description"
+            ]
+        }
+    ]
+}
--- a/easydistill/mmkd/train_lora.py
+++ b/easydistill/mmkd/train_lora.py
@@ -0,0 +1,289 @@
+# Copyright 2024 Alibaba Group Holding Limited. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import json
+import torch
+import numpy as np
+import jsonlines
+import torch.nn.functional as F
+import os
+import argparse
+import logging
+from datasets import load_dataset, Dataset
+from typing import Optional, Dict, Union, List
+from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor
+from transformers import (
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    TrainingArguments,
+)
+from qwen_vl_utils import process_vision_info
+from trl import SFTTrainer, SFTConfig
+
+
+# Configure the root logger once at import time: INFO level, with each record
+# rendered as "<timestamp> - <level> - <message>".
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+
+from torch.utils.data import Dataset
+from PIL import Image
+import os
+
+
+class MMDataset(Dataset):
+    """Map-style dataset that exposes an in-memory sequence of samples.
+
+    The wrapped container is stored by reference (no copy) and samples are
+    returned exactly as stored.
+    """
+
+    def __init__(self, data):
+        self.data = data
+
+    def __getitem__(self, idx):
+        # Normalise the index through int() before the lookup.
+        position = int(idx)
+        return self.data[position]
+
+    def __len__(self):
+        return len(self.data)
+
+
+class DistillSFTTrainer(SFTTrainer):
+    """SFTTrainer variant that blends the LM loss with a white-box distillation loss.
+
+    Teacher outputs are read once, at construction time, from the JSON-lines
+    file ``logits_dir``. Each record is indexed as ``record[position]`` and
+    every position is a mapping from vocabulary id to a teacher value. The
+    values are fed to the KL terms as probabilities (no softmax is applied to
+    them here).
+
+    Args:
+        logits_dir: Path of the JSON-lines file with the teacher outputs. When
+            falsy, nothing is loaded and ``compute_loss`` returns the plain
+            language-modelling loss.
+        teacher_vocab_size: Size of the last dimension of the dense teacher
+            tensor built per batch.
+        kd_ratio: Weight of the distillation term; the LM loss is weighted by
+            ``1 - kd_ratio``.
+        max_seq_length: Number of sequence positions kept from both the
+            teacher records and the student logits.
+        distillation_type: ``"forward_kld"`` or ``"reverse_kld"``.
+        **kwargs: Forwarded unchanged to ``SFTTrainer.__init__``.
+    """
+
+    def __init__(
+        self,
+        logits_dir: Optional[str] = None,
+        teacher_vocab_size=None,
+        kd_ratio: float = 0.5,
+        max_seq_length: int = 1024,
+        distillation_type: str = "forward_kld",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.logits_dir = logits_dir
+        self.teacher_vocab_size = teacher_vocab_size
+        self.kd_ratio = kd_ratio
+        self.max_seq_length = max_seq_length
+        self.distillation_type = distillation_type
+        self.teacher_logits = []
+        # compute_loss treats a falsy logits_dir as "no distillation", so the
+        # constructor must not try to open it in that case.
+        if self.logits_dir:
+            with jsonlines.open(self.logits_dir) as reader:
+                self.teacher_logits = list(reader)
+
+    def _load_teacher_logits(
+        self,
+        batch_size: int,
+        it: int,
+        dp_rank: int,
+        device: torch.device,
+        no_model_batch: Dict,
+    ):
+        """Build the dense teacher tensor for one batch.
+
+        Args:
+            batch_size: Number of samples in the current batch.
+            it: Iteration counter used to locate the batch in the teacher file.
+            dp_rank: Data-parallel rank of the calling process.
+            device: Device on which the returned tensor is created.
+            no_model_batch: Dict whose ``"label"`` entry holds the label
+                tensor of the batch (or ``None``).
+
+        Returns:
+            A bfloat16 tensor of shape
+            ``(batch_size, max_seq_length, teacher_vocab_size)``. When labels
+            are given, each row is shifted right so that it starts at the
+            first label different from -100.
+        """
+        # NOTE(review): this indexing assumes the teacher file is ordered
+        # exactly as batches are consumed by this rank — confirm against the
+        # sampler / gradient-accumulation settings in use.
+        start_idx = dp_rank * batch_size + batch_size * it
+        end_idx = start_idx + batch_size
+        loaded_data = self.teacher_logits[start_idx:end_idx]
+        # float32 is enough: the array is cast to bfloat16 right below, so a
+        # float64 buffer would only double this (large) allocation.
+        arr = np.zeros(
+            (batch_size, self.max_seq_length, self.teacher_vocab_size),
+            dtype=np.float32,
+        )
+        for i, sample in enumerate(loaded_data):
+            # Teacher records longer than max_seq_length are truncated instead
+            # of indexing past the end of ``arr``.
+            for j, token_values in enumerate(sample[: self.max_seq_length]):
+                keys = np.array(list(token_values.keys()), dtype=int)
+                values = np.array(list(token_values.values()))
+                arr[i, j, keys] = values
+
+        logits_tensor = torch.tensor(arr, dtype=torch.bfloat16, device=device)
+        labels = no_model_batch["label"]
+        if labels is None:
+            # Without labels there is no offset to align to.
+            return logits_tensor
+        return self._shift_tensor_right(logits_tensor, labels, pad_value=0)
+
+    def _compute_white_box_distillation_loss(
+        self,
+        student_logits: torch.Tensor,
+        teacher_logits: torch.Tensor,
+        labels: Optional[torch.Tensor],
+    ):
+        """Return the masked KL distillation loss between student and teacher.
+
+        Args:
+            student_logits: Raw student logits, ``(batch, seq, vocab)``.
+            teacher_logits: Dense teacher tensor from ``_load_teacher_logits``;
+                used directly as probabilities.
+            labels: Label tensor; positions equal to -100 are excluded. When
+                ``None`` every position contributes.
+
+        Returns:
+            A scalar tensor.
+
+        Raises:
+            ValueError: If ``distillation_type`` is not ``"forward_kld"`` or
+                ``"reverse_kld"``.
+        """
+        student_logits = student_logits[:, : self.max_seq_length, :]
+        seq_len = student_logits.size(1)
+        teacher_probs = teacher_logits[:, :seq_len, : student_logits.size(-1)]
+        if labels is not None:
+            # Truncate the labels like the logits; otherwise sequences longer
+            # than max_seq_length produce a mask that cannot be multiplied
+            # with the per-token loss.
+            mask = (labels[:, :seq_len] != -100).float()
+        else:
+            mask = torch.ones_like(student_logits[:, :, 0])
+        # Guard against a batch without any supervised token (0 / 0 -> NaN).
+        num_tokens = mask.sum().clamp(min=1)
+
+        # NOTE(review): the per-token KL is divided by the number of
+        # supervised tokens here and the masked sum is divided by it again in
+        # the return statement. Kept unchanged to preserve the existing loss
+        # scale relative to kd_ratio — confirm whether this is intended.
+        if self.distillation_type == "forward_kld":
+            # Forward KLD: teacher values are the target distribution.
+            loss = F.kl_div(
+                F.log_softmax(student_logits, dim=-1),
+                teacher_probs,
+                reduction="none",
+                log_target=False,
+            ).sum(dim=-1) / num_tokens
+        elif self.distillation_type == "reverse_kld":
+            # Reverse KLD: the student distribution is the target.
+            loss = F.kl_div(
+                torch.log(teacher_probs.clamp(min=1e-10)),  # avoid log(0)
+                F.softmax(student_logits, dim=-1),
+                reduction="none",
+                log_target=False,
+            ).sum(dim=-1) / num_tokens
+        else:
+            raise ValueError(
+                f"Unsupported distillation type: {self.distillation_type}. Use 'forward_kld' or 'reverse_kld'"
+            )
+
+        return (loss * mask).sum() / num_tokens
+
+    @staticmethod
+    def _shift_tensor_right(
+        inputs: torch.Tensor, labels: torch.Tensor, pad_value: float = 0.0
+    ):
+        """Shift each row of ``inputs`` right along the sequence axis.
+
+        The shift of a row is the index of its first label different from
+        -100 (0 when the row has none). Positions vacated on the left are
+        filled with ``pad_value``; values pushed past the end are dropped.
+
+        Args:
+            inputs: Tensor of shape ``(batch, seq, vocab)``.
+            labels: Label tensor with one row per batch element.
+            pad_value: Fill value for the vacated positions.
+
+        Returns:
+            A tensor with the same shape as ``inputs``.
+        """
+        batch_size, seqlen, vocab_size = inputs.shape
+        device = inputs.device
+        # argmax over the 0/1 mask yields the first supervised position.
+        shift_distances = torch.argmax((labels != -100).int(), dim=1)
+        idx = (
+            torch.arange(seqlen, device=device).unsqueeze(0).expand(batch_size, seqlen)
+        )
+        shifted_idx = idx - shift_distances.unsqueeze(1)
+        # Negative source indices mark positions that must be padded.
+        mask = shifted_idx >= 0
+        shifted_idx = shifted_idx.clamp(min=0)
+        shifted_idx = shifted_idx.unsqueeze(2).expand(-1, -1, vocab_size)
+        gathered = torch.gather(inputs, 1, shifted_idx)
+        mask = mask.unsqueeze(2).expand(-1, -1, vocab_size)
+        return torch.where(mask, gathered, torch.full_like(gathered, pad_value))
+
+    def compute_loss(
+        self,
+        model: PreTrainedModel,
+        inputs: Dict[str, torch.Tensor],
+        return_outputs=False,
+        num_items_in_batch=None,
+    ):
+        """Compute the training loss for one batch.
+
+        The model's own loss is combined with the distillation loss as
+        ``(1 - kd_ratio) * lm_loss + kd_ratio * distil_loss`` when
+        ``logits_dir`` is set; otherwise the model's loss is returned as is.
+
+        Args:
+            model: The student model.
+            inputs: Batch passed to the model as keyword arguments.
+            return_outputs: When true, return ``(loss, outputs)``.
+            num_items_in_batch: Accepted for interface compatibility; unused.
+
+        Returns:
+            The total loss, or ``(total_loss, outputs)`` when
+            ``return_outputs`` is true.
+        """
+        outputs = model(**inputs)
+        lm_loss = outputs.loss
+        if self.logits_dir:
+            labels = inputs.get("labels", None)
+            teacher_logits = self._load_teacher_logits(
+                batch_size=inputs["input_ids"].size(0),
+                it=self.state.global_step,
+                dp_rank=(
+                    torch.distributed.get_rank()
+                    if torch.distributed.is_initialized()
+                    else 0
+                ),
+                device=model.device,
+                no_model_batch={"label": labels},
+            )
+            distil_loss = self._compute_white_box_distillation_loss(
+                student_logits=outputs.logits,
+                teacher_logits=teacher_logits,
+                labels=labels,
+            )
+            total_loss = (1 - self.kd_ratio) * lm_loss + self.kd_ratio * distil_loss
+        else:
+            total_loss = lm_loss
+        return (total_loss, outputs) if return_outputs else total_loss
+
+
+def train(config):
+    """Fine-tune the student model according to ``config``.
+
+    Reads the labelled conversations from ``config["dataset"]["labeled_path"]``,
+    loads the student model and processor from ``config["models"]["student"]``
+    and runs either plain SFT (job type containing ``"mmkd_black_box"``) or
+    SFT plus white-box distillation (``"mmkd_white_box"``). The trained model
+    and the tokenizer are saved to ``config["training"]["output_dir"]``.
+
+    Args:
+        config: Parsed JSON configuration with the keys ``job_type``,
+            ``dataset``, ``models``, ``training`` and, for white-box jobs,
+            ``distillation``.
+
+    Returns:
+        None. A ``ValueError`` raised while building the trainer (including
+        an invalid job type) is logged and ends the function early.
+    """
+    with open(config["dataset"]["labeled_path"], "r", encoding="utf-8") as f:
+        raw_data = json.load(f)
+    dataset = MMDataset(raw_data)
+    student_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        config["models"]["student"], trust_remote_code=True
+    )
+    processor = Qwen2_5_VLProcessor.from_pretrained(config["models"]["student"])
+
+    training_arguments = SFTConfig(**config["training"])
+    training_arguments.gradient_checkpointing_kwargs = dict(use_reentrant=False)
+    # The collator below needs the raw samples, so the trainer must neither
+    # drop columns nor pre-process the dataset itself.
+    training_arguments.remove_unused_columns = False
+    training_arguments.dataset_kwargs = {"skip_prepare_dataset": True}
+
+    def collate_fn(examples):
+        """Turn a list of chat samples into a padded model batch with labels."""
+        texts = []
+        images = []
+        for example in examples:
+            texts.append(processor.apply_chat_template(example, tokenize=False))
+
+            image, _ = process_vision_info(example)
+            images.append(image)
+
+        batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
+        # Labels are the input ids with padding and image tokens ignored.
+        labels = batch["input_ids"].clone()
+        labels[labels == processor.tokenizer.pad_token_id] = -100
+
+        if isinstance(processor, Qwen2_5_VLProcessor):
+            # presumably the vision-start / vision-end / image-pad token ids
+            # of the Qwen2.5-VL tokenizer — TODO confirm
+            image_tokens = [151652, 151653, 151655]
+        else:
+            image_tokens = [
+                processor.tokenizer.convert_tokens_to_ids(processor.image_token)
+            ]
+
+        for image_token_id in image_tokens:
+            labels[labels == image_token_id] = -100
+        batch["labels"] = labels
+        return batch
+
+    try:
+        job_type = config["job_type"]
+        if "mmkd_black_box" in job_type:
+
+            trainer = SFTTrainer(
+                model=student_model,
+                data_collator=collate_fn,
+                processing_class=processor.tokenizer,
+                args=training_arguments,
+                train_dataset=dataset,
+            )
+        elif "mmkd_white_box" in job_type:
+            teacher_config_path = os.path.join(
+                config["models"]["teacher"], "config.json"
+            )
+            # Use a context manager so the config file handle is closed.
+            with open(teacher_config_path, "r", encoding="utf-8") as f:
+                teacher_vocab_size = json.load(f)["vocab_size"]
+            trainer = DistillSFTTrainer(
+                logits_dir=config["dataset"]["logits_path"],
+                data_collator=collate_fn,
+                teacher_vocab_size=teacher_vocab_size,
+                kd_ratio=config["distillation"]["kd_ratio"],
+                max_seq_length=config["distillation"]["max_seq_length"],
+                distillation_type=config["distillation"].get(
+                    "distillation_type", "forward_kld"
+                ),
+                model=student_model,
+                processing_class=processor.tokenizer,
+                args=training_arguments,
+                train_dataset=dataset,
+            )
+        else:
+            logging.error(f"Invalid job type: {job_type}")
+            raise ValueError(f"Invalid job type: {job_type}")
+    except ValueError as e:
+        logging.error(f"Training job terminated: {e}")
+        return
+
+    trainer.train()
+    trainer.save_model(config["training"]["output_dir"])
+    processor.tokenizer.save_pretrained(config["training"]["output_dir"])
+
+
+def main():
+    """Parse ``--config``, load the JSON configuration and start training."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--config", type=str, required=True, help="path to the json config file"
+    )
+    args = parser.parse_args()
+    # Read through a context manager so the file handle is closed promptly.
+    with open(args.config, "r", encoding="utf-8") as f:
+        config = json.load(f)
+    train(config)
+
+
+# Run the command-line entry point only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/easydistill/mmkd/vqa.json
+++ b/easydistill/mmkd/vqa.json