[Init] Initialize EasyDistill for knowledge distillation
237
easydistill/mmkd/dev-vqa/gen_vqa_bank.py
Normal file
@@ -0,0 +1,237 @@
import json
import os
import random
from pathlib import Path
import glob
import re

def load_json(filepath):
    """
    Loads a JSON file with robust error handling.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Error: The file was not found at {filepath}")
        return None
    except json.JSONDecodeError as e:
        print(f"Error: The file at {filepath} is not a valid JSON file. Details: {e}")
        return None

def read_text_file(filepath):
    """
    Loads a simple text file.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except FileNotFoundError:
        print(f"Error: The file was not found at {filepath}")
        return None

def format_items_list(items, language):
    """
    Formats a list of item dictionaries into a human-readable string.
    """
    if not items:
        return ""

    formatted_lines = []
    for item in items:
        if not isinstance(item, dict):
            continue
        parts = []
        desc = item.get("description")
        if desc is not None:
            parts.append(f"{desc}")
        qty = item.get("quantity")
        if qty is not None:
            qty_str = "Quantity" if language == "english" else "Quantité"
            parts.append(f"{qty_str}: {qty}")
        date = item.get("date_of_service")
        if date is not None:
            date_str = "Date"  # same label in English and French
            parts.append(f"{date_str}: {date}")
        mandatory = item.get("mandatory_coverage")
        if mandatory is not None:
            amo_str = "Mandatory Coverage" if language == "english" else "Couverture obligatoire"
            parts.append(f"{amo_str}: {mandatory}")
        amount = item.get("amount")
        if amount is not None:
            amount_str = "Amount" if language == "english" else "Montant"
            parts.append(f"{amount_str}: {amount}")
        formatted_lines.append("- " + ", ".join(parts))
    return "\n".join(formatted_lines)

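# Illustrative sketch of the output format (hypothetical item values, not taken
# from any real label file):
#   format_items_list(
#       [{"description": "Consultation", "quantity": 1, "amount": "23.00"}],
#       "english",
#   )
# returns: "- Consultation, Quantity: 1, Amount: 23.00"
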
def get_conversational_answer(field, label_data, answer_bank, language):
    """
    Generates a complete conversational answer by selecting a template and filling it
    with the appropriate value from the label data.
    """
    value = label_data.get(field)
    field_templates = answer_bank.get(field)

    # No templates for this field: fall back to the raw value.
    if not field_templates:
        return str(value) if value is not None else ""

    # Missing value: use one of the field's "null" templates, if any.
    if value is None:
        return random.choice(field_templates.get("null", {}).get(language, [""]))
    # Line items are formatted as a bulleted list before filling the template.
    if field == "items":
        template = random.choice(field_templates[language])
        formatted_list_string = format_items_list(value, language)
        return template.format(value=formatted_list_string)
    # Booleans select the "true"/"false" template group.
    if isinstance(value, bool):
        bool_key = str(value).lower()
        if bool_key in field_templates[language]:
            return random.choice(field_templates[language][bool_key])
        return str(value)
    # Plain scalar values fill a randomly chosen template.
    if isinstance(field_templates[language], list):
        template = random.choice(field_templates[language])
        return template.format(value=value)
    return str(value) if value is not None else ""

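# The question and answer banks are external JSON files and are not part of this
# commit. From the lookups above, the code assumes shapes roughly like the
# following (field names and template strings here are illustrative only):
#
#   question_bank.json: {"total_amount": {"english": ["What is the total?"],
#                                         "french": ["Quel est le montant total ?"]}}
#   answer_bank.json:   {"total_amount": {"english": ["The total is {value}."],
#                                         "french": ["Le total est de {value}."],
#                                         "null": {"english": ["It is not stated."],
#                                                  "french": ["Ce n'est pas indiqué."]}}}
# Boolean fields would instead map the language key to a {"true": [...], "false": [...]} dict.
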
# --- Conversation Generation from Label Data ---
def generate_field_level_conversations(labels_path, image_root, system_prompt_path, questions_path, answers_path, output_path):
    """
    Generates multiple conversational VQA pairs for each field in a label file,
    and handles multi-page documents.
    """
    all_data_entries = load_json(labels_path)
    system_prompt = read_text_file(system_prompt_path)
    question_bank = load_json(questions_path)
    answer_bank = load_json(answers_path)

    if not all_data_entries or not system_prompt or not question_bank or not answer_bank:
        print("Could not load one or more necessary files. Exiting.")
        return

    final_conversations = []

    # Process each entry in the main label file
    for entry in all_data_entries:
        label_data = entry.get("label")
        image_filename_prefix = entry.get("image")

        # Skip entries that are unlabeled, as we need the label to generate Q&A pairs
        if not label_data or not image_filename_prefix:
            continue

        # Find all image files in the image_root that start with the prefix.
        # This handles cases like 'doc-1.jpg', 'doc-2.jpg', 'doc_scale.jpg', etc.
        prefix_stem = Path(image_filename_prefix).stem
        search_pattern = os.path.join(image_root, f"{prefix_stem}*")
        found_image_paths = sorted(glob.glob(search_pattern))

        if not found_image_paths:
            print(f"Warning: No images found for prefix '{prefix_stem}' in '{image_root}'. Skipping.")
            continue

        # Create a list of image dictionaries for the user message
        image_content_list = [{"type": "image", "image": path} for path in found_image_paths]

        # --- Create a new conversation for EACH field in the label ---
        for field in label_data:
            if not isinstance(field, str):
                continue
            if field not in question_bank:
                continue

            language = random.choice(['english', 'french'])

            # Get the question from the question bank
            question_text = random.choice(question_bank[field][language])

            # Get the conversational answer from the answer bank
            answer_text = get_conversational_answer(field, label_data, answer_bank, language)

            # --- Assemble the conversation in the desired format ---
            system_message = {
                "role": "system",
                "content": system_prompt
            }

            user_message = {
                "role": "user",
                # The content is the list of image dicts, followed by the text dict
                "content": image_content_list + [{"type": "text", "text": "<image>" + question_text}]
            }

            assistant_message = {
                "role": "assistant",
                "content": answer_text
            }

            conversation = [system_message, user_message, assistant_message]
            final_conversations.append(conversation)

    # Save the final list of conversations to the output file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(final_conversations, f, indent=4, ensure_ascii=False)

    print(f"Success! Generated {len(final_conversations)} conversational VQA entries.")
    print(f"Formatted data saved to: {output_path}")

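# Sketch of one generated entry, assuming a single-page document and an English
# question; the actual prompt, question, and answer text come from the external
# bank files, so the strings below are illustrative only:
#   [
#     {"role": "system", "content": "<system prompt text>"},
#     {"role": "user", "content": [
#         {"type": "image", "image": ".../doc-1.jpg"},
#         {"type": "text", "text": "<image>What is the total amount?"}]},
#     {"role": "assistant", "content": "The total is 23.00."}
#   ]
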
# --- Conversation Generation from Images Only ---
def generate_image_only_conversations(image_root, system_prompt_path, questions_path, output_path):
    """
    Generates conversational VQA pairs for each document based on images only (no labels).
    Groups all images with the same prefix (including _1_scale, _2_scale, etc.) into the same conversation.
    Each conversation contains a system and user message for each question in the question bank.
    """
    system_prompt = read_text_file(system_prompt_path)
    question_bank = load_json(questions_path)

    if not system_prompt or not question_bank:
        print("Could not load one or more necessary files. Exiting.")
        return

    # Find all images and group by prefix
    all_image_paths = sorted(glob.glob(os.path.join(image_root, "*")))
    prefix_to_images = {}
    for path in all_image_paths:
        if not os.path.isfile(path):
            continue
        stem = Path(path).stem
        # Remove suffixes like _1_scale, _2_scale, etc.
        prefix = re.sub(r'(_\d+(_scale)?)$', '', stem)
        prefix_to_images.setdefault(prefix, []).append(path)

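    # For example (hypothetical file names), "facture_1_scale.jpg" and
    # "facture_2_scale.jpg" are both stripped to the prefix "facture" and land
    # in the same conversation; a plain "facture.jpg" would join them unchanged.
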
    final_conversations = []

    for prefix, image_paths in prefix_to_images.items():
        image_content_list = [{"type": "image", "image": path} for path in sorted(image_paths)]
        for field, lang_dict in question_bank.items():
            for language in lang_dict:
                for question_text in lang_dict[language]:
                    system_message = {
                        "role": "system",
                        "content": system_prompt
                    }
                    user_message = {
                        "role": "user",
                        "content": image_content_list + [{"type": "text", "text": "<image>" + question_text}]
                    }
                    conversation = [system_message, user_message]
                    final_conversations.append(conversation)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(final_conversations, f, indent=4, ensure_ascii=False)

    print(f"Success! Generated {len(final_conversations)} image-only conversational VQA entries.")
    print(f"Formatted data saved to: {output_path}")

# --- Main Execution Block ---
if __name__ == "__main__":

    # Define file paths
    IMAGE_ROOT = '/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_1'
    LABELS_FILE = os.path.join(IMAGE_ROOT, 'label_data.json')
    SYSTEM_PROMPT_FILE = '/home/nguyendc/phong-dev/distill/prompt/system_prompt.txt'
    QUESTION_BANK_FILE = '/home/nguyendc/phong-dev/distill/prompt/question_bank.json'
    ANSWER_BANK_FILE = '/home/nguyendc/phong-dev/distill/prompt/answer_bank.json'
    OUTPUT_FILE = os.path.join(IMAGE_ROOT, 'vqa_nolabel.json')

    # Run the main generation function
    # generate_field_level_conversations(LABELS_FILE, IMAGE_ROOT, SYSTEM_PROMPT_FILE, QUESTION_BANK_FILE, ANSWER_BANK_FILE, OUTPUT_FILE)
    generate_image_only_conversations(IMAGE_ROOT, SYSTEM_PROMPT_FILE, QUESTION_BANK_FILE, OUTPUT_FILE)
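    # Usage note: the hard-coded paths above are environment-specific and assumed
    # to exist. Running `python gen_vqa_bank.py` writes the image-only conversation
    # list to OUTPUT_FILE; uncommenting the generate_field_level_conversations call
    # produces the labeled variant, and since both calls write to OUTPUT_FILE, the
    # second call would overwrite the first.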