modify gen_vqa_bank to randomly select ratio number of fields to ask

modify gen_vqa_bank
2025-08-08 14:20:33 +00:00 · 2025-08-07 15:45:55 +00:00
4 changed files with 181 additions and 10 deletions
--- a/easydistill/mmkd/dev-vqa/gen_vqa_bank.py
+++ b/easydistill/mmkd/dev-vqa/gen_vqa_bank.py
@@ -7,7 +7,7 @@ import re

 def load_json(filepath):
    """
-    Loads a JSON file with robust error handling.
+    Loads a JSON file.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
@@ -21,7 +21,7 @@ def load_json(filepath):

 def read_text_file(filepath):
    """
-    Loads a simple text file.
+    Loads a prompt from a text file.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
@@ -32,7 +32,7 @@ def read_text_file(filepath):

 def format_items_list(items, language):
    """
-    Formats a list of item dictionaries into a human-readable string.
+    Formats a list of item dictionaries (services) into a human-readable string.
    """
    if not items:
        return ""
@@ -92,7 +92,7 @@ def get_conversational_answer(field, label_data, answer_bank, language):
    return str(value) if value is not None else ""

 # --- Conversations Generation for Label Data ---
-def generate_field_level_conversations(labels_path, image_root, system_prompt_path, questions_path, answers_path, output_path):
+def generate_vqa_conversations(labels_path, image_root, system_prompt_path, questions_path, answers_path, output_path, ratio=0.4):
    """
    Generates multiple conversational VQA pairs for each field in a label file,
    and handles multi-page documents.
@@ -117,6 +117,14 @@ def generate_field_level_conversations(labels_path, image_root, system_prompt_pa
        if not label_data or not image_filename_prefix:
            continue

+        
+        # Get a list of all fields in the label data
+        all_fields = [field for field in label_data if field in question_bank]
+        # Determine how many questions to ask based on the available fields
+        num_to_sample = max(1, int(len(all_fields) * ratio))
+        # Randomly select fields to ask questions about
+        fields_to_ask = random.sample(all_fields, num_to_sample)
+        
        # Find all image files in the image_root that start with the prefix.
        # This handles cases like 'doc-1.jpg', 'doc-2.jpg', 'doc_scale.jpg' etc.
        prefix_stem = Path(image_filename_prefix).stem
@@ -131,7 +139,7 @@ def generate_field_level_conversations(labels_path, image_root, system_prompt_pa
        image_content_list = [{"type": "image", "image": path} for path in found_image_paths]

        # --- Create a new conversation for EACH field in the label ---
-        for field in label_data:
+        for field in fields_to_ask:
            if not isinstance(field, str):
                continue
            if field not in question_bank:
@@ -173,10 +181,9 @@ def generate_field_level_conversations(labels_path, image_root, system_prompt_pa
    print(f"Formatted data saved to: {output_path}")

 # --- Conversations Generation for only Images ---
-def generate_image_only_conversations(image_root, system_prompt_path, questions_path, output_path):
+def generate_vq_question(image_root, system_prompt_path, questions_path, output_path, ratio=0.4):
    """
    Generates conversational VQA pairs for each document based on images only (no labels).
-    Groups all images with the same prefix (including _1_scale, _2_scale, etc.) into the same conversation.
    Each conversation contains a system and user message for each question in the question bank.
    """
    system_prompt = read_text_file(system_prompt_path)
@@ -197,11 +204,21 @@ def generate_image_only_conversations(image_root, system_prompt_path, questions_
        prefix = re.sub(r'(_\d+(_scale)?)$', '', stem)
        prefix_to_images.setdefault(prefix, []).append(path)

+    # Get a list of all possible fields from the question bank.
+    all_fields = list(question_bank.keys())
+    # Determine how many questions to ask based on the available fields
+    num_to_sample = max(1, int(len(all_fields) * ratio))
+
    final_conversations = []

    for prefix, image_paths in prefix_to_images.items():
        image_content_list = [{"type": "image", "image": path} for path in sorted(image_paths)]
-        for field, lang_dict in question_bank.items():
+        
+        # Randomly select fields to ask questions about
+        fields_to_ask = random.sample(all_fields, num_to_sample)
+        
+        for field in fields_to_ask:
+            lang_dict = question_bank[field]
            for language in lang_dict:
                for question_text in lang_dict[language]:
                    system_message = {
@@ -228,10 +245,13 @@ if __name__ == "__main__":
    IMAGE_ROOT = '/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_1'
    LABELS_FILE = os.path.join(IMAGE_ROOT, 'label_data.json')
    SYSTEM_PROMPT_FILE = '/home/nguyendc/phong-dev/distill/prompt/system_prompt.txt'
+    UNSTRUCTURED_PROMPT_FILE = '/home/nguyendc/phong-dev/distillation/easydistill/mmkd/dev-vqa/qa_bank/unstructured_prompt.txt'
    QUESTION_BANK_FILE = '/home/nguyendc/phong-dev/distill/prompt/question_bank.json'
    ANSWER_BANK_FILE = '/home/nguyendc/phong-dev/distill/prompt/answer_bank.json'
    OUTPUT_FILE = os.path.join(IMAGE_ROOT, 'vqa_nolabel.json')
+    QUESTION_RATIO = 0.5 
+
    
    # Run the main generation function
-    # generate_field_level_conversations(LABELS_FILE, IMAGE_ROOT, SYSTEM_PROMPT_FILE, QUESTION_BANK_FILE, ANSWER_BANK_FILE, OUTPUT_FILE)
-    generate_image_only_conversations(IMAGE_ROOT, SYSTEM_PROMPT_FILE, QUESTION_BANK_FILE, OUTPUT_FILE)
+    # generate_vqa_conversations(LABELS_FILE, IMAGE_ROOT, UNSTRUCTURED_PROMPT_FILE, QUESTION_BANK_FILE, ANSWER_BANK_FILE, OUTPUT_FILE)
+    generate_vq_question(IMAGE_ROOT, UNSTRUCTURED_PROMPT_FILE, QUESTION_BANK_FILE, OUTPUT_FILE)
--- a/easydistill/mmkd/dev-vqa/qa_bank/system_prompt.txt
+++ b/easydistill/mmkd/dev-vqa/qa_bank/system_prompt.txt
@@ -0,0 +1,19 @@
+You are an advanced AI agent created by Rizlum AI. You are designed to extract structured information from health invoices with high accuracy. Your task is to parse invoices and return only the requested fields in a strict JSON format.
+
+### **General Instructions**
+1. **Extract Only the Specified Fields**: Do not include extra information.
+2. **Do Not Guess or hallucinate if information is missing or represented by placeholders (e.g., dots, dashes).**
+3. **Ignore irrelevant fields (e.g., address, SIRET, membership numbers).**
+4. **Ensure Strictly Valid JSON Output**: Do not return additional text or explanations.
+5. **Field Relationship Guidance**: Formula: total_billed = mandatory_coverage + complementary_coverage + client_part. Instruction: Prioritize extracting each value directly from the invoice, and only if it appears there. This formula is a guide to verify the consistency of extracted numbers, not a command to calculate a missing total_billed.
+
+### **Handling Ambiguous Cases**
+- **Adeli Number**: If a 9-digit number appears without the keyword 'Adeli', check if it matches the Adeli number format and is associated with a recognized healthcare professional.
+- **Doctor Selection**:
+   - If the invoice shows multiple doctors, exclude any doctor that is visibly crossed out.
+   - Prioritize doctor information (e.g., name, Adeli, RPPS) within a stamp (identified by visual stamp features like borders or official markings) over unstamped doctor blocks. Exclude unstamped doctor information if a stamped block exists.
+- **Item Selection in Tables**:
+  - If multiple items or acts are listed, extract only those that are highlighted (e.g., marked with color).
+  - Ignore all other items that are not explicitly marked or priced.
+- **Date**:
+  - Distinguish carefully between similar characters: treat '/1' as '1' (e.g., January), not '11' (e.g., November), by focusing on stroke separation and context rather than assuming a slash implies a specific number.
--- a/easydistill/mmkd/dev-vqa/qa_bank/unstructured_prompt.txt
+++ b/easydistill/mmkd/dev-vqa/qa_bank/unstructured_prompt.txt
@@ -0,0 +1,5 @@
+You are an advanced AI agent created by Rizlum AI. You are designed to extract structured information from health invoices with high accuracy. Your task is to parse invoices and return only the requested fields.
+
+### **General Instructions**
+1. **Extract Only the Specified Fields**: Do not include extra information.
+2. **Do Not Guess or hallucinate if information is missing or represented by placeholders (e.g., dots, dashes).**
--- a/easydistill/mmkd/dev-vqa/qa_bank/user_prompt.txt
+++ b/easydistill/mmkd/dev-vqa/qa_bank/user_prompt.txt
@@ -0,0 +1,127 @@
+Extract the following structured information from the provided invoice. Fill in only existing values.
+Strictly return a valid JSON following this schema:
+
+**Json schema**
+{
+  "type": "object",
+  "properties": {
+    "is_bill": {
+      "type": "boolean",
+      "description": "True if the document is an invoice, false otherwise."
+    },
+    "profession": {
+      "type": ["string", "null"],
+      "description": "Type of healthcare profession, if it is present in the list [Optique, Kinésiologie, Kinésithérapie, Pharmacie, Biologie, Psychologie, Infirmier, Ostéopathie, Dentaire, Sage-femme, Sophrologie, Soins hospitaliers, Orthopédie, Podologie, Diététique, Radiologie, Orthophonie, Pédiatrie, Assurance Maladie, Pompes funèbres, Laboratoire, Gynécologie-obstétrique, Chiropractie, Psychomotricité, Ostéodensitométrie, Pneumologie, Vaccins, Sevrage tabagique, Contraception, Homéopathie, Acupunture], Unknown otherwise."
+    },
+    "adeli_number": {
+      "type": ["string", "null"],
+      "description": "Adeli number (9-digit identifier) associated with the healthcare provider"
+    },
+    "rpps_number": {
+      "type": ["string", "null"],
+      "description": "11 digits identifier, indicated after the term 'RPPS'"
+    },
+    "finess_number": {
+      "type": ["string", "null"],
+      "description": "9 digits identifier, indicated after one of the terms in list ['finess', 'identifiant CPAM']"
+    },
+    "doctor_name": {
+      "type": ["string", "null"],
+      "description": "Full name of the doctor"
+    },
+    "prescripteur_finess_number": {
+      "type": ["string", "null"],
+      "description": "Finess number of the prescriber in the invoice (9 digits identifier, indicated after the term 'finess')"
+    },
+    "total_billed": {
+      "type": ["number", "null"],
+      "description": "The total amount billed on the invoice"
+    },
+    "bill_paid": {
+      "type": "boolean",
+      "description": "True if the invoice has been paid, false otherwise (Look for terms like: 'acquittée', 'payée', 'quittance', 'réglée', 'certifie avoir reçu le règlement')"
+    },
+    "amount_paid": {
+      "type": ["number", "null"],
+      "description": "The amount paid for the invoice"
+    },
+    "mandatory_coverage": {
+      "type": ["number", "null"],
+      "description": "Amount covered by compulsory health insurance (indicated after terms like 'AMO', 'Rbmt RO', 'CAISSE', 'Noemie', etc.)"
+    },
+    "complementary_coverage": {
+      "type": ["number", "null"],
+      "description": "Amount covered by complementary insurance (indicated after terms like 'AMC', 'RC', 'Mutuelle')"
+    },
+    "client_part": {
+      "type": ["number", "null"],
+      "description": "Amount paid by client (indicated after terms like 'ASSURE', 'Part Client', 'Part Assuré')"
+    },
+    "remaining_payment": {
+      "type": ["number", "null"],
+      "description": "The remaining balance to be paid by the beneficiary if the invoice is unpaid."
+    },
+    "insured_name": {
+      "type": ["string", "null"],
+      "description": "Full name of the insured person (indicated after terms like 'Assure')"
+    },
+    "insured_dob": {
+      "type": ["string", "null"],
+      "description": "Date of birth of the insured person (format: dd-mm-yyyy)"
+    },
+    "beneficiary_name": {
+      "type": ["string", "null"],
+      "description": "Full name of the invoice beneficiary"
+    },
+    "beneficiary_dob": {
+      "type": ["string", "null"],
+      "description": "Date of birth of the beneficiary (format: dd-mm-yyyy)"
+    },
+    "invoice_date": {
+      "type": ["string", "null"],
+      "description": "Date of the invoice (format: dd-mm-yyyy)"
+    },
+    "security_number": {
+      "type": ["string", "null"],
+      "description": "Social Security number (13 or 15 digit identifier, indicated after terms like 'Sécurité Sociale', 'N° INSEE' or 'N° SS')"
+    },
+    "invoice_issuer": {
+      "type": ["string", "null"],
+      "description": "Name or organization issuing the invoice or providing the service"
+    },
+    "currency": {
+      "type": ["string", "null"],
+      "description": "Currency used (e.g., EUR, USD)"
+    },
+    "items": {
+      "type": "array",
+      "description": "List of items or services included in the invoice.",
+      "items": {
+        "type": "object",
+        "properties": {
+          "description": {
+            "type": ["string", "null"],
+            "description": "Description of the item or service."
+          },
+          "quantity": {
+            "type": ["number", "null"],
+            "description": "Quantity of the item or service."
+          },
+          "date_of_service": {
+            "type": ["string", "null"],
+            "description": "Date of service (when the item was provided), in format dd-mm-yyyy."
+          },
+          "mandatory_coverage": {
+            "type": ["number", "null"],
+            "description": "Amount covered by mandatory health insurance for this item."
+          },
+          "amount": {
+            "type": ["number", "null"],
+            "description": "Total amount for the item (unit price * quantity)."
+          }
+        }
+      }
+    }
+  }
+}
+
Author	SHA1	Message	Date
lphatnguyen	3b43f89df5	modify gen_vqa_bank to randomly select ratio number of fields to ask	2025-08-08 14:20:33 +00:00
lphatnguyen	bbefb444a9	modify gen_vqa_bank	2025-08-07 15:45:55 +00:00