[Fix] prompt templates

[Add] Subfield VQA Generation
2025-08-08 22:16:45 +00:00 · 2025-08-08 22:16:14 +00:00
4 changed files with 624 additions and 649 deletions
--- a/easydistill/mmkd/create_question_answering_pairs.py
+++ b/easydistill/mmkd/create_question_answering_pairs.py
@@ -1,12 +1,14 @@
 import json
-import re
+import numpy as np
+import argparse
+import os


 def load_prompt_templates(filepath):
    """Loads the prompt templates from a JSON file."""
    try:
        with open(filepath, "r", encoding="utf-8") as f:
-            return json.load(f)
+            return json.load(f)["templates"]
    except FileNotFoundError:
        print(f"Error: The file {filepath} was not found.")
        return None
@@ -78,44 +80,131 @@ def get_label_from_prompt(question, data, templates):
    return {"error": "No matching prompt found."}


+def _resolve_image_paths(label: dict, media_dir: str):
+    """Return the image paths for ``label``, or None when none can be used.
+
+    ``label["image"]`` is treated as a substring to look for among the file
+    names in ``media_dir``; ``label["image_files"]`` is treated as a list of
+    file names relative to ``media_dir``, all of which must exist.
+    """
+    if "image" in label:
+        image_substring = label["image"]
+        # Sorted so the image order does not depend on the filesystem.
+        image_paths = [
+            os.path.join(media_dir, fn)
+            for fn in sorted(os.listdir(media_dir))
+            if image_substring in fn
+        ]
+    elif "image_files" in label:
+        image_paths = [
+            os.path.join(media_dir, image_path)
+            for image_path in label["image_files"]
+        ]
+        # One missing file invalidates the whole sample.
+        if not all(os.path.exists(path) for path in image_paths):
+            return None
+    else:
+        return None
+
+    # A sample without any image cannot be answered, so treat it as unusable.
+    return image_paths or None
+
+
+def match_question_to_template(
+    templates: list,
+    language: str,
+    system_prompt: str,
+    json_schema: dict,
+    label: dict,
+    media_dir: str,
+):
+    """Build one system/user/assistant conversation for a labelled document.
+
+    A template is drawn at random from ``templates``; its ``"target_keys"``
+    select which entries of ``json_schema["properties"]`` are shown in the
+    user prompt and which entries of ``label["label"]`` form the answer.
+
+    Args:
+        templates: Sequence of template dicts, each with a ``"target_keys"``
+            list of schema property names.
+        language: Currently unused; kept so existing callers keep working.
+        system_prompt: Text of the system message.
+        json_schema: Schema dict with a ``"properties"`` mapping.
+        label: Document record with a ``"label"`` mapping and either an
+            ``"image"`` substring or an ``"image_files"`` list.
+        media_dir: Directory containing the image files.
+
+    Returns:
+        A list of three message dicts (roles ``system``, ``user``,
+        ``assistant_gt``), or None when no usable image is found.
+
+    Raises:
+        KeyError: If a target key is absent from ``json_schema["properties"]``.
+    """
+    # Preparing system prompt
+    conversations = [{"role": "system", "content": system_prompt}]
+
+    # Preparing user prompt
+    # Select randomly from the template list
+    template = np.random.choice(templates)
+
+    selected_field_list = template["target_keys"]
+    # Restrict the schema to the fields targeted by this template
+    prompt_object = {
+        field: json_schema["properties"][field] for field in selected_field_list
+    }
+    # ensure_ascii=False keeps accented characters readable in the prompt
+    # instead of turning them into \uXXXX escapes.
+    prompt_object_string = json.dumps(prompt_object, indent=4, ensure_ascii=False)
+
+    user_question = f"""Extract the following structured information from the provided invoice. Fill in only existing values.
+Strictly return a valid JSON following this schema:
+
+**Json schema**
+{prompt_object_string}
+"""
+    image_paths = _resolve_image_paths(label, media_dir)
+    if image_paths is None:
+        return None
+
+    image_contents = [
+        {"type": "image", "image": image_path} for image_path in image_paths
+    ]
+    # One "<image>" placeholder per attached image precedes the question.
+    user_contents = image_contents + [
+        {"type": "text", "text": "<image>" * len(image_contents) + user_question},
+    ]
+    conversations.append({"role": "user", "content": user_contents})
+
+    # Preparing assistant output: fields missing from the label become null
+    label_values = label["label"]
+    object_label = {field: label_values.get(field) for field in selected_field_list}
+    assistant_object = {
+        "role": "assistant_gt",
+        "content": [
+            {
+                "type": "text",
+                "text": json.dumps(object_label, indent=4, ensure_ascii=False),
+            }
+        ],
+    }
+    conversations.append(assistant_object)
+
+    return conversations
+
+
+def prepare_vqa(
+    label_json_path: str,
+    prompt_template_path: str,
+    system_prompt_path: str,
+    json_schema_path: str,
+    media_dir: str,
+    output_vqa_json_path: str,
+    samples_per_label: int = 10,
+):
+    """Generate VQA conversations for every labelled document and save them.
+
+    Args:
+        label_json_path: JSON file holding the list of label records.
+        prompt_template_path: JSON file read by ``load_prompt_templates``.
+        system_prompt_path: Text file whose content is the system prompt.
+        json_schema_path: JSON file with the extraction schema.
+        media_dir: Directory containing the document images.
+        output_vqa_json_path: Destination JSON file for the conversations.
+        samples_per_label: Number of random templates drawn per label record.
+
+    Returns:
+        None. If an input cannot be loaded, an error is printed and nothing
+        is written.
+    """
+    try:
+        with open(label_json_path, encoding="utf-8") as label_file:
+            label_data = json.load(label_file)
+
+        prompt_templates = load_prompt_templates(prompt_template_path)
+
+        with open(system_prompt_path, encoding="utf-8") as system_prompt_file:
+            system_prompt = system_prompt_file.read()
+
+        with open(json_schema_path, encoding="utf-8") as json_schema_file:
+            json_schema = json.load(json_schema_file)
+    except (OSError, ValueError, KeyError, TypeError) as e:
+        print(f"Error: {e}")
+        return
+
+    # load_prompt_templates signals a missing file by returning None; an
+    # empty template list cannot be sampled from either.
+    if not prompt_templates:
+        print(f"Error: no prompt templates loaded from {prompt_template_path}")
+        return
+
+    vqa = []
+    for label in label_data:
+        # Randomly draw `samples_per_label` question/answer pairs per document
+        for _ in range(samples_per_label):
+            vqa_object = match_question_to_template(
+                prompt_templates, "en", system_prompt, json_schema, label, media_dir
+            )
+            if vqa_object is not None:
+                vqa.append(vqa_object)
+
+    with open(output_vqa_json_path, "w", encoding="utf-8") as output_file:
+        json.dump(vqa, output_file, indent=4)
+
+
 # --- Main execution ---
 if __name__ == "__main__":
-    label_data = json.load(
-        open(
-            "/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_1/label_data.json"
-        )
+    # Every path is mandatory: an omitted flag would otherwise reach
+    # prepare_vqa as None.
+    argparser = argparse.ArgumentParser(
+        description="Generate sub-field VQA conversations from labelled documents."
+    )
+    argparser.add_argument(
+        "--label_json_path", type=str, required=True,
+        help="JSON file with the label records.",
+    )
+    argparser.add_argument(
+        "--prompt_template_path", type=str, required=True,
+        help="JSON file with the prompt templates.",
+    )
+    argparser.add_argument(
+        "--system_prompt_path", type=str, required=True,
+        help="Text file containing the system prompt.",
+    )
+    argparser.add_argument(
+        "--json_schema_path", type=str, required=True,
+        help="JSON schema describing the fields to extract.",
+    )
+    argparser.add_argument(
+        "--media_dir", type=str, required=True,
+        help="Directory containing the document images.",
+    )
+    argparser.add_argument(
+        "--output_vqa_json_path", type=str, required=True,
+        help="Where to write the generated conversations (JSON).",
+    )
+    args = argparser.parse_args()
+
+    prepare_vqa(
+        args.label_json_path,
+        args.prompt_template_path,
+        args.system_prompt_path,
+        args.json_schema_path,
+        args.media_dir,
+        args.output_vqa_json_path,
     )
-    # 1. Load the templates
-    prompt_templates = load_prompt_templates("prompt_templates.json")
-
-    # 2. Define questions to ask in both English and French
-    user_question_en = "Who is the doctor?"
-    user_question_fr = "Aperçu de la facturation"
-    user_question_invalid = "What is the weather?"
-
-    # 3. Get the label (sub-object) from the prompts
-    if prompt_templates:
-        answer_en = get_label_from_prompt(
-            user_question_en, label_data, prompt_templates
-        )
-        answer_fr = get_label_from_prompt(
-            user_question_fr, label_data, prompt_templates
-        )
-        answer_invalid = get_label_from_prompt(
-            user_question_invalid, label_data, prompt_templates
-        )
-
-        print(f"Question (EN): '{user_question_en}'")
-        print("Answer (JSON Object):")
-        print(json.dumps(answer_en, indent=2, ensure_ascii=False))
-        print("-" * 20)
-
-        print(f"Question (FR): '{user_question_fr}'")
-        print("Answer (JSON Object):")
-        print(json.dumps(answer_fr, indent=2, ensure_ascii=False))
-        print("-" * 20)
-
-        print(f"Question (Invalid): '{user_question_invalid}'")
-        print("Answer (JSON Object):")
-        print(json.dumps(answer_invalid, indent=2, ensure_ascii=False))
-        print("-" * 20)
--- a/easydistill/mmkd/facture_json_schema.json
+++ b/easydistill/mmkd/facture_json_schema.json
@@ -0,0 +1,211 @@
+{
+    "type": "object",
+    "properties": {
+        "is_bill": {
+            "type": "boolean",
+            "description": "True if the document is an invoice, false otherwise."
+        },
+        "profession": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "Type of healthcare profession, if it is presented in the list [Optique, Kinésiologie, Kinésithérapie, Pharmacie, Biologie, Psychologie, Infirmier, Ostéopathie, Dentaire, Sage-femme, Sophrologie, Soins hospitaliers, Orthopédie, Podologie, Diététique, Radiologie, Orthophonie, Pédiatrie, Assurance Maladie, Pompes funèbres, Laboratoire, Gynécologie-obstétrique, Chiropractie, Psychomotricité, Ostéodensitométrie, Pneumologie, Vaccins, Sevrage tabagique, Contraception, Homéopathie, Acupunture], Unknown otherwise."
+        },
+        "adeli_number": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "Adeli number (9-digit identifier) associated with the healthcare provider"
+        },
+        "rpps_number": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "11 digits identifier, indicated after the term 'RPPS'"
+        },
+        "finess_number": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "9 digits identifier, indicated after one of the terms in list ['finess', 'identifiant CPAM']"
+        },
+        "doctor_name": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "Full name of the doctor"
+        },
+        "prescripteur_finess_number": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "Finess number of the prescriber in the invoice (9 digits identifier, indicated after the term 'finess')"
+        },
+        "total_billed": {
+            "type": [
+                "number",
+                "null"
+            ],
+            "description": "The total amount billed on the invoice"
+        },
+        "bill_paid": {
+            "type": "boolean",
+            "description": "True if the invoice has been paid, false otherwise (Look for terms like: 'acquittée', 'payée', 'quittance', 'réglée', 'certifie avoir reçu le règlement')"
+        },
+        "amount_paid": {
+            "type": [
+                "number",
+                "null"
+            ],
+            "description": "The amount paid for the invoice"
+        },
+        "mandatory_coverage": {
+            "type": [
+                "number",
+                "null"
+            ],
+            "description": "Amount covered by compulsory health insurance (indicated after terms like 'AMO', 'Rbmt RO', 'CAISSE', 'Noemie', etc.)"
+        },
+        "complementary_coverage": {
+            "type": [
+                "number",
+                "null"
+            ],
+            "description": "Amount covered by complementary insurance (indicated after terms like 'AMC', 'RC', 'Mutuelle')"
+        },
+        "client_part": {
+            "type": [
+                "number",
+                "null"
+            ],
+            "description": "Amount paid by client (indicated after terms like 'ASSURE', 'Part Client', 'Part Assuré')"
+        },
+        "remaining_payment": {
+            "type": [
+                "number",
+                "null"
+            ],
+            "description": "The remaining balance to be paid by the beneficiary if the invoice is unpaid."
+        },
+        "insured_name": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "Full name of the insured person (indicated after terms like 'Assure')"
+        },
+        "insured_dob": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "Date of birth of the insured person (format: dd-mm-yyyy)"
+        },
+        "beneficiary_name": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "Full name of the invoice beneficiary"
+        },
+        "beneficiary_dob": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "Date of birth of the beneficiary (format: dd-mm-yyyy)"
+        },
+        "care_start_date": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "Care start date (format: dd-mm-yyyy)"
+        },
+        "care_end_date": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "Care end date (format: dd-mm-yyyy)"
+        },
+        "invoice_date": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "Date of the invoice (format: dd-mm-yyyy)"
+        },
+        "security_number": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "Social Security number (13 or 15 digit identifier, indicated after terms like 'Sécurité Social' or 'N° INSEE' or 'N° SS')"
+        },
+        "invoice_issuer": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "Name or organization issuing the invoice or providing the service"
+        },
+        "currency": {
+            "type": [
+                "string",
+                "null"
+            ],
+            "description": "Currency used (e.g., EUR, USD)"
+        },
+        "items": {
+            "type": "array",
+            "description": "List of items or services included in the invoice.",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "description": {
+                        "type": [
+                            "string",
+                            "null"
+                        ],
+                        "description": "Description of the item or service."
+                    },
+                    "quantity": {
+                        "type": [
+                            "number",
+                            "null"
+                        ],
+                        "description": "Quantity of the item or service."
+                    },
+                    "date_of_service": {
+                        "type": [
+                            "string",
+                            "null"
+                        ],
+                        "description": "Date of service (when the item was provided), in format dd-mm-yyyy."
+                    },
+                    "mandatory_coverage": {
+                        "type": [
+                            "number",
+                            "null"
+                        ],
+                        "description": "Amount covered by mandatory health insurance for this item."
+                    },
+                    "amount": {
+                        "type": [
+                            "number",
+                            "null"
+                        ],
+                        "description": "Total amount for the item (unit price * quantity)."
+                    }
+                }
+            }
+        }
+    }
+}
--- a/easydistill/mmkd/prompt_templates.json
+++ b/easydistill/mmkd/prompt_templates.json
--- a/easydistill/mmkd/system_prompts.txt
+++ b/easydistill/mmkd/system_prompts.txt
@@ -0,0 +1,19 @@
+You are an advanced AI agent created by Rizlum AI. You are designed to extract structured information from health invoices with high accuracy. Your task is to parse invoices and answer the user questions.
+
+### **General Instructions**
+1. **Extract Only the Specified Fields**: Do not include extra information.
+2. **Do Not Guess or hallucinate if information is missing or represented by placeholders (e.g., dots, dashes).**
+3. **Ignore irrelevant fields (e.g., address, SIRET, membership numbers).**
+4. **Ensure Strictly Valid JSON Output**: Do not return additional text or explanations.
+5. **Field Relationship Guidance**: Formula: total_billed = mandatory_coverage + complementary_coverage + client_part. Instruction: Prioritize extracting all values directly and only if they appear on the invoice. This formula is a guide to verify the consistency of extracted numbers, not a command to calculate a missing total_billed.
+
+### **Handling Ambiguous Cases**
+- **Adeli Number**: If a 9-digit number appears without the keyword 'Adeli', check if it matches the Adeli number format and is associated with a recognized healthcare professional.
+- **Doctor Selection**:
+   - If the invoice shows multiple doctors, exclude any doctor that is visibly crossed out.
+   - Prioritize doctor information (e.g., name, Adeli, RPPS) within a stamp (identified by visual stamp features like borders or official markings) over unstamped doctor blocks. Exclude unstamped doctor information if a stamped block exists.
+- **Item Selection in Tables**:
+  - If multiple items or acts are listed, extract only those that are highlighted (e.g., marked with color).
+  - Ignore all other items that are not explicitly marked or priced.
+- **Date**:
+  - Distinguish carefully between similar characters: treat '/1' as '1' (e.g., January), not '11' (e.g., November), by focusing on stroke separation and context rather than assuming a slash implies a specific number.
Author	SHA1	Message	Date
lphatnguyen	0393ae3ab4	[Fix] prompt templates	2025-08-08 22:16:45 +00:00
lphatnguyen	14cc0e118e	[Add] Subfield VQA Generation	2025-08-08 22:16:14 +00:00