3 Commits

SHA1 Message Date
03bddf60ce modify gen_vqa_bank 2025-08-08 15:07:37 +00:00
3b43f89df5 modify gen_vqa_bank to randomly select ratio number of fields to ask 2025-08-08 14:20:33 +00:00
bbefb444a9 modify gen_vqa_bank 2025-08-07 15:45:55 +00:00
7 changed files with 942 additions and 698 deletions
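The thread running through these commits is that gen_vqa_bank no longer asks about every schema field: it now samples a ratio-sized subset of the question-bank fields per document. A minimal sketch of that selection step, with illustrative field names (in the actual script the list comes from question_bank.keys()):

import random

def sample_fields(all_fields, ratio=0.4):
    # Always ask at least one question; otherwise floor(len(all_fields) * ratio).
    num_to_sample = max(1, int(len(all_fields) * ratio))
    return random.sample(all_fields, num_to_sample)

print(sample_fields(["doctor_name", "total_billed", "invoice_date", "currency", "items"]))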

View File

@@ -1,14 +1,12 @@
import json
import numpy as np
import argparse
import os
import re
def load_prompt_templates(filepath):
"""Loads the prompt templates from a JSON file."""
try:
with open(filepath, "r", encoding="utf-8") as f:
return json.load(f)["templates"]
return json.load(f)
except FileNotFoundError:
print(f"Error: The file {filepath} was not found.")
return None
@@ -80,131 +78,44 @@ def get_label_from_prompt(question, data, templates):
return {"error": "No matching prompt found."}
def match_question_to_template(
templates: list,
language: str,
system_prompt: str,
json_schema: dict,
label: dict,
media_dir: str,
):
# Preparing system prompt
conversations = [{"role": "system", "content": system_prompt}]
# Preparing user prompt
# Select randomly from the template list
template = np.random.choice(templates)
selected_field_list = template["target_keys"]
# select field from json_schema
prompt_object = {}
for field in selected_field_list:
prompt_object[field] = json_schema["properties"][field]
prompt_object_string = json.dumps(prompt_object, indent=4)
user_question = f"""Extract the following structured information from the provided invoice. Fill in only existing values.
Strictly return a valid JSON following this schema:
**Json schema**
{prompt_object_string}
"""
fns = os.listdir(media_dir)
image_paths = []
if "image" in label:
image_substring = label["image"]
for fn in fns:
if image_substring in fn:
image_paths.append(media_dir + fn)
elif "image_files" in label:
for image_path in label["image_files"]:
if os.path.exists(media_dir + image_path):
image_paths.append(media_dir + image_path)
else:
return None
else:
return None
image_contents = [
{"type": "image", "image": image_path} for image_path in image_paths
]
user_contents = image_contents + [
{"type": "text", "text": "<image>" * len(image_contents) + user_question},
]
user_object = {"role": "user", "content": user_contents}
conversations.append(user_object)
# Preparing assistant output
object_label = {}
for field in selected_field_list:
if field in label["label"]:
object_label[field] = label["label"][field]
else:
object_label[field] = None
assistant_object = {
"role": "assistant_gt",
"content": [
{
"type": "text",
"text": json.dumps(object_label, indent=4),
}
],
}
conversations.append(assistant_object)
return conversations
def prepare_vqa(
label_json_path: str,
prompt_template_path: str,
system_prompt_path: str,
json_schema_path: str,
media_dir: str,
output_vqa_json_path: str,
):
try:
label_data = json.load(open(label_json_path))
prompt_templates = load_prompt_templates(prompt_template_path)
with open(system_prompt_path) as system_prompt_file:
system_prompt = system_prompt_file.read()
with open(json_schema_path) as json_schema_file:
json_schema = json.load(json_schema_file)
except Exception as e:
print(f"Error: {e}")
return
vqa = []
for label in label_data:
# Randomly generate 10 question-answer pairs per label from the English templates
for _ in range(10):
vqa_object = match_question_to_template(
prompt_templates, "en", system_prompt, json_schema, label, media_dir
)
if vqa_object is not None:
vqa.append(vqa_object)
with open(output_vqa_json_path, "w") as output_file:
output_file.write(json.dumps(vqa, indent=4))
# --- Main execution ---
if __name__ == "__main__":
argparser = argparse.ArgumentParser()
argparser.add_argument("--label_json_path", type=str)
argparser.add_argument("--prompt_template_path", type=str)
argparser.add_argument("--system_prompt_path", type=str)
argparser.add_argument("--json_schema_path", type=str)
argparser.add_argument("--media_dir", type=str)
argparser.add_argument("--output_vqa_json_path", type=str)
args = argparser.parse_args()
prepare_vqa(
args.label_json_path,
args.prompt_template_path,
args.system_prompt_path,
args.json_schema_path,
args.media_dir,
args.output_vqa_json_path,
)
label_data = json.load(
open(
"/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_1/label_data.json"
)
)
# 1. Load the templates
prompt_templates = load_prompt_templates("prompt_templates.json")
# 2. Define questions to ask in both English and French
user_question_en = "Who is the doctor?"
user_question_fr = "Aperçu de la facturation"
user_question_invalid = "What is the weather?"
# 3. Get the label (sub-object) from the prompts
if prompt_templates:
answer_en = get_label_from_prompt(
user_question_en, label_data, prompt_templates
)
answer_fr = get_label_from_prompt(
user_question_fr, label_data, prompt_templates
)
answer_invalid = get_label_from_prompt(
user_question_invalid, label_data, prompt_templates
)
print(f"Question (EN): '{user_question_en}'")
print("Answer (JSON Object):")
print(json.dumps(answer_en, indent=2, ensure_ascii=False))
print("-" * 20)
print(f"Question (FR): '{user_question_fr}'")
print("Answer (JSON Object):")
print(json.dumps(answer_fr, indent=2, ensure_ascii=False))
print("-" * 20)
print(f"Question (Invalid): '{user_question_invalid}'")
print("Answer (JSON Object):")
print(json.dumps(answer_invalid, indent=2, ensure_ascii=False))
print("-" * 20)

View File

@@ -5,12 +5,13 @@ from pathlib import Path
import glob
import re
def load_json(filepath):
"""
Loads a JSON file with robust error handling.
Loads a JSON file.
"""
try:
with open(filepath, 'r', encoding='utf-8') as f:
with open(filepath, "r", encoding="utf-8") as f:
return json.load(f)
except FileNotFoundError:
print(f"Error: The file was not found at {filepath}")
@@ -19,20 +20,22 @@ def load_json(filepath):
print(f"Error: The file at {filepath} is not a valid JSON file. Details: {e}")
return None
def read_text_file(filepath):
"""
Loads a simple text file.
Loads a prompt from a text file.
"""
try:
with open(filepath, 'r', encoding='utf-8') as f:
with open(filepath, "r", encoding="utf-8") as f:
return f.read().strip()
except FileNotFoundError:
print(f"Error: The file was not found at {filepath}")
return None
def format_items_list(items, language):
"""
Formats a list of item dictionaries into a human-readable string.
Formats a list of item dictionaries (services) into a human-readable string.
"""
if not items:
return ""
@@ -55,7 +58,11 @@ def format_items_list(items, language):
parts.append(f"{date_str}: {date}")
mandatory = item.get("mandatory_coverage")
if mandatory is not None:
amo_str = "Mandatory Coverage" if language == "english" else "Couverture obligatoire"
amo_str = (
"Mandatory Coverage"
if language == "english"
else "Couverture obligatoire"
)
parts.append(f"{amo_str}: {mandatory}")
amount = item.get("amount")
if amount is not None:
@@ -64,11 +71,14 @@ def format_items_list(items, language):
formatted_lines.append("- " + ", ".join(parts))
return "\n".join(formatted_lines)
def get_conversational_answer(field, label_data, answer_bank, language):
"""
Generates a complete conversational answer by selecting a template and filling it
with the appropriate value from the label data.
"""
if not isinstance(label_data, dict):
return ""
value = label_data.get(field)
field_templates = answer_bank.get(field)
@@ -91,8 +101,17 @@ def get_conversational_answer(field, label_data, answer_bank, language):
return template.format(value=value)
return str(value) if value is not None else ""
# --- Conversations Generation for Label Data ---
def generate_field_level_conversations(labels_path, image_root, system_prompt_path, questions_path, answers_path, output_path):
def generate_vqa_conversations(
labels_path,
image_root,
system_prompt_path,
questions_path,
answers_path,
output_path,
ratio=0.4,
):
"""
Generates conversational VQA pairs for a randomly sampled subset of fields in each label entry,
and handles multi-page documents.
@@ -102,7 +121,12 @@ def generate_field_level_conversations(labels_path, image_root, system_prompt_pa
question_bank = load_json(questions_path)
answer_bank = load_json(answers_path)
if not all_data_entries or not system_prompt or not question_bank or not answer_bank:
if (
not all_data_entries
or not system_prompt
or not question_bank
or not answer_bank
):
print("Could not load one or more necessary files. Exiting.")
return
@@ -117,6 +141,14 @@ def generate_field_level_conversations(labels_path, image_root, system_prompt_pa
if not label_data or not image_filename_prefix:
continue
# Get a list of all fields in the label data
# all_fields = [field for field in label_data if isinstance(field, str) and field in question_bank]
all_fields = list(question_bank.keys())
# Determine how many questions to ask based on the available fields
num_to_sample = max(1, int(len(all_fields) * ratio))
# Randomly select fields to ask questions about
fields_to_ask = random.sample(all_fields, num_to_sample)
# Find all image files in the image_root that start with the prefix.
# This handles cases like 'doc-1.jpg', 'doc-2.jpg', 'doc_scale.jpg' etc.
prefix_stem = Path(image_filename_prefix).stem
@@ -124,59 +156,62 @@ def generate_field_level_conversations(labels_path, image_root, system_prompt_pa
found_image_paths = sorted(glob.glob(search_pattern))
if not found_image_paths:
print(f"Warning: No images found for prefix '{prefix_stem}' in '{image_root}'. Skipping.")
print(
f"Warning: No images found for prefix '{prefix_stem}' in '{image_root}'. Skipping."
)
continue
# Create a list of image dictionaries for the user message
image_content_list = [{"type": "image", "image": path} for path in found_image_paths]
image_content_list = [
{"type": "image", "image": path} for path in found_image_paths
]
# --- Create a new conversation for EACH field in the label ---
for field in label_data:
for field in fields_to_ask:
if not isinstance(field, str):
continue
if field not in question_bank:
continue
language = random.choice(['english', 'french'])
language = random.choice(["english", "french"])
# Get the question from the question bank
question_text = random.choice(question_bank[field][language])
# Get the conversational answer from the answer bank
answer_text = get_conversational_answer(field, label_data, answer_bank, language)
answer_text = get_conversational_answer(
field, label_data, answer_bank, language
)
# --- Assemble the conversation in the desired format ---
system_message = {
"role": "system",
"content": system_prompt
}
system_message = {"role": "system", "content": system_prompt}
user_message = {
"role": "user",
# The content is the list of image dicts, followed by the text dict
"content": image_content_list + [{"type": "text", "text": "<image>"+ question_text}]
"content": image_content_list
+ [{"type": "text", "text": "<image>" + question_text}],
}
assistant_message = {
"role": "assistant",
"content": answer_text
}
assistant_message = {"role": "assistant", "content": answer_text}
conversation = [system_message, user_message, assistant_message]
final_conversations.append(conversation)
# Save the final list of conversations to the output file
with open(output_path, 'w', encoding='utf-8') as f:
with open(output_path, "w", encoding="utf-8") as f:
json.dump(final_conversations, f, indent=4, ensure_ascii=False)
print(f"Success! Generated {len(final_conversations)} conversational VQA entries.")
print(f"Formatted data saved to: {output_path}")
# --- Conversations Generation for only Images ---
def generate_image_only_conversations(image_root, system_prompt_path, questions_path, output_path):
def generate_vq_question(
image_root, system_prompt_path, questions_path, output_path, ratio=0.4
):
"""
Generates conversational VQA pairs for each document based on images only (no labels).
Groups all images with the same prefix (including _1_scale, _2_scale, etc.) into the same conversation.
Each conversation contains a system and user message for a randomly sampled subset of questions from the question bank.
"""
system_prompt = read_text_file(system_prompt_path)
@@ -187,51 +222,84 @@ def generate_image_only_conversations(image_root, system_prompt_path, questions_
return
# Find all images and group by prefix
all_image_paths = sorted(glob.glob(os.path.join(image_root, "*")))
all_image_paths = sorted(
glob.glob(os.path.join(image_root, "*.jpg"))
+ glob.glob(os.path.join(image_root, "*.png"))
+ glob.glob(os.path.join(image_root, "*.jpeg"))
)
prefix_to_images = {}
for path in all_image_paths:
if not os.path.isfile(path):
continue
stem = Path(path).stem
# Remove suffixes like _1_scale, _2_scale, etc.
prefix = re.sub(r'(_\d+(_scale)?)$', '', stem)
prefix = re.sub(r"(_\d+(_scale)?)$", "", stem)
prefix_to_images.setdefault(prefix, []).append(path)
# Get a list of all possible fields from the question bank.
all_fields = list(question_bank.keys())
# Determine how many questions to ask based on the available fields
num_to_sample = max(1, int(len(all_fields) * ratio))
final_conversations = []
for prefix, image_paths in prefix_to_images.items():
image_content_list = [{"type": "image", "image": path} for path in sorted(image_paths)]
for field, lang_dict in question_bank.items():
for language in lang_dict:
for question_text in lang_dict[language]:
system_message = {
"role": "system",
"content": system_prompt
}
user_message = {
"role": "user",
"content": image_content_list + [{"type": "text", "text": "<image>" + question_text}]
}
conversation = [system_message, user_message]
final_conversations.append(conversation)
image_content_list = [
{"type": "image", "image": path} for path in sorted(image_paths)
]
with open(output_path, 'w', encoding='utf-8') as f:
# Randomly select fields to ask questions about
fields_to_ask = random.sample(all_fields, num_to_sample)
for field in fields_to_ask:
language = random.choice(["english", "french"])
question_text = random.choice(question_bank[field][language])
system_message = {"role": "system", "content": system_prompt}
user_message = {
"role": "user",
"content": image_content_list
+ [{"type": "text", "text": "<image>" + question_text}],
}
conversation = [system_message, user_message]
final_conversations.append(conversation)
with open(output_path, "w", encoding="utf-8") as f:
json.dump(final_conversations, f, indent=4, ensure_ascii=False)
print(f"Success! Generated {len(final_conversations)} image-only conversational VQA entries.")
print(
f"Success! Generated {len(final_conversations)} image-only conversational VQA entries."
)
print(f"Formatted data saved to: {output_path}")
# --- Main Execution Block ---
if __name__ == "__main__":
# Define file paths
IMAGE_ROOT = '/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_1'
LABELS_FILE = os.path.join(IMAGE_ROOT, 'label_data.json')
SYSTEM_PROMPT_FILE = '/home/nguyendc/phong-dev/distill/prompt/system_prompt.txt'
QUESTION_BANK_FILE = '/home/nguyendc/phong-dev/distill/prompt/question_bank.json'
ANSWER_BANK_FILE = '/home/nguyendc/phong-dev/distill/prompt/answer_bank.json'
OUTPUT_FILE = os.path.join(IMAGE_ROOT, 'vqa_nolabel.json')
IMAGE_ROOT = "/home/nguyendc/model-factory/Finetuning-Automation/etc/data/media/docai_mgp_facture_v2_1"
LABELS_FILE = os.path.join(IMAGE_ROOT, "label_data.json")
SYSTEM_PROMPT_FILE = os.path.join(IMAGE_ROOT, "system_prompt.txt")
UNSTRUCTURED_PROMPT_FILE = "/home/nguyendc/phong-dev/distillation/easydistill/mmkd/dev-vqa/qa_bank/unstructured_prompt.txt"
QUESTION_BANK_FILE = "/home/nguyendc/phong-dev/distill/prompt/question_bank.json"
ANSWER_BANK_FILE = "/home/nguyendc/phong-dev/distill/prompt/answer_bank.json"
OUTPUT_FILE = "/home/nguyendc/phong-dev/distill/vqa_label.json"
QUESTION_RATIO = 0.4
# Run the main generation function
# generate_field_level_conversations(LABELS_FILE, IMAGE_ROOT, SYSTEM_PROMPT_FILE, QUESTION_BANK_FILE, ANSWER_BANK_FILE, OUTPUT_FILE)
generate_image_only_conversations(IMAGE_ROOT, SYSTEM_PROMPT_FILE, QUESTION_BANK_FILE, OUTPUT_FILE)
generate_vqa_conversations(
LABELS_FILE,
IMAGE_ROOT,
UNSTRUCTURED_PROMPT_FILE,
QUESTION_BANK_FILE,
ANSWER_BANK_FILE,
OUTPUT_FILE,
QUESTION_RATIO,
)
# generate_vq_question(
# IMAGE_ROOT,
# UNSTRUCTURED_PROMPT_FILE,
# QUESTION_BANK_FILE,
# OUTPUT_FILE,
# QUESTION_RATIO,
# )
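A note on the prefix grouping in generate_vq_question: the regex strips trailing page/scale suffixes so that all pages of one document land in the same conversation. A quick standalone check with made-up filenames:

import re
from pathlib import Path

for fn in ["facture_1.jpg", "facture_2_scale.jpg", "facture_10.png", "recu.jpg"]:
    stem = Path(fn).stem
    print(fn, "->", re.sub(r"(_\d+(_scale)?)$", "", stem))
# facture_1.jpg -> facture
# facture_2_scale.jpg -> facture
# facture_10.png -> facture
# recu.jpg -> recu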

View File

@@ -1,4 +1,4 @@
You are an advanced AI agent created by Rizlum AI. You are designed to extract structured information from health invoices with high accuracy. Your task is to parse invoices and answer the user questions.
You are an advanced AI agent created by Rizlum AI. You are designed to extract structured information from health invoices with high accuracy. Your task is to parse invoices and return only the requested fields in a strict JSON format.
### **General Instructions**
1. **Extract Only the Specified Fields**: Do not include extra information.

View File

@@ -0,0 +1,5 @@
You are an advanced AI agent created by Rizlum AI. Your task is to parse invoices and return only the requested information.
### **General Instructions**
1. **Extract Only the Specified Fields**: Do not include extra information.
2. **Do Not Guess or hallucinate if information is missing or represented by placeholders (e.g., dots, dashes).**

View File

@@ -0,0 +1,127 @@
Extract the following structured information from the provided invoice. Fill in only existing values.
Strictly return a valid JSON following this schema:
**Json schema**
{
"type": "object ",
"properties": {
"is_bill": {
"type": "boolean",
"description": "True if the document is an invoice, false otherwise."
},
"profession": {
"type": ["string", "null"],
"description": "Type of healthcare profession, if it is presented in the list [Optique, Kinésiologie, Kinésithérapie, Pharmacie, Biologie, Psychologie, Infirmier, Ostéopathie, Dentaire, Sage-femme, Sophrologie, Soins hospitaliers, Orthopédie, Podologie, Diététique, Radiologie, Orthophonie, Pédiatrie, Assurance Maladie, Pompes funèbres, Laboratoire, Gynécologie-obstétrique, Chiropractie, Psychomotricité, Ostéodensitométrie, Pneumologie, Vaccins, Sevrage tabagique, Contraception, Homéopathie, Acupunture], Unknown otherwise."
},
"adeli_number": {
"type": ["string", "null"],
"description": "Adeli number (9-digit identifier) associated with the healthcare provider"
},
"rpps_number": {
"type": ["string", "null"],
"description": "11 digits identifier, indicated after the term 'RPPS'"
},
"finess_number": {
"type": ["string", "null"],
"description": "9 digits identifier, indicated after one of the terms in list ['finess', 'identifiant CPAM']"
},
"doctor_name": {
"type": ["string", "null"],
"description": "Full name of the doctor"
},
"prescripteur_finess_number": {
"type": ["string", "null"],
"description": "Finess number of the prescriber in the invoice (9 digits identifier, indicated after the term 'finess')"
},
"total_billed": {
"type": ["number", "null"],
"description": "The total amount billed on the invoice"
},
"bill_paid": {
"type": "boolean",
"description": "True if the invoice has been paid, false otherwise (Look for terms like: 'acquittée', 'payée', 'quittance', 'réglée', 'certifie avoir reçu le règlement')"
},
"amount_paid": {
"type": ["number", "null"],
"description": "The amount paid for the invoice"
},
"mandatory_coverage": {
"type": ["number", "null"],
"description": "Amount covered by compulsory health insurance (indicated after terms like 'AMO', 'Rbmt RO', 'CAISSE', 'Noemie', etc.)"
},
"complementary_coverage": {
"type": ["number", "null"],
"description": "Amount covered by complementary insurance (indicated after terms like 'AMC', 'RC', 'Mutuelle')"
},
"client_part": {
"type": ["number", "null"],
"description": "Amount paid by client (indicated after terms like 'ASSURE', 'Part Client', 'Part Assuré')"
},
"remaining_payment": {
"type": ["number", "null"],
"description": "The remaining balance to be paid by the beneficiary if the invoice is unpaid."
},
"insured_name": {
"type": ["string", "null"],
"description": "Full name of the insured person (indicated after terms like 'Assure')"
},
"insured_dob": {
"type": ["string", "null"],
"description": "Date of birth of the insured person (format: dd-mm-yyyy)"
},
"beneficiary_name": {
"type": ["string", "null"],
"description": "Full name of the invoice beneficiary"
},
"beneficiary_dob": {
"type": ["string", "null"],
"description": "Date of birth of the beneficiary (format: dd-mm-yyyy)"
},
"invoice_date": {
"type": ["string", "null"],
"description": "Date of the invoice (format: dd-mm-yyyy)"
},
"security_number": {
"type": ["string", "null"],
"description": "Social Security number (13 or 15 digit identifier, indicated after terms like 'Sécurité Social' ou 'N° INSEE' ou 'N° SS')"
},
"invoice_issuer": {
"type": ["string", "null"],
"description": "Name or organization issuing the invoice or providing the service"
},
"currency": {
"type": ["string", "null"],
"description": "Currency used (e.g., EUR, USD)"
},
"items": {
"type": "array",
"description": "List of items or services included in the invoice.",
"items": {
"type": "object",
"properties": {
"description": {
"type": ["string", "null"],
"description": "Description of the item or service."
},
"quantity": {
"type": ["number", "null"],
"description": "Quantity of the item or service."
},
"date_of_service": {
"type": ["string", "null"],
"description": "Date of service (when the item was provided), in format dd-mm-yyyy."
},
"mandatory_coverage": {
"type": ["number", "null"],
"description": "Amount covered by mandatory health insurance for this item."
},
"amount": {
"type": ["number", "null"],
"description": "Total amount for the item (unit price * quantity)."
}
}
}
}
}
}
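Since the prompt requires output that validates against this schema, one cheap sanity check on generated answers is the third-party jsonschema package. This is not used anywhere in the repo; it is only an illustration, and the schema file name here is hypothetical:

import json
from pathlib import Path

import jsonschema  # pip install jsonschema

schema = json.loads(Path("invoice_schema.json").read_text())  # hypothetical standalone copy of the schema above
candidate = {"is_bill": True, "doctor_name": "Dr. Example", "bill_paid": False}
jsonschema.validate(instance=candidate, schema=schema)  # raises jsonschema.ValidationError on mismatch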

View File

@@ -1,211 +0,0 @@
{
"type": "object ",
"properties": {
"is_bill": {
"type": "boolean",
"description": "True if the document is an invoice, false otherwise."
},
"profession": {
"type": [
"string",
"null"
],
"description": "Type of healthcare profession, if it is presented in the list [Optique, Kinésiologie, Kinésithérapie, Pharmacie, Biologie, Psychologie, Infirmier, Ostéopathie, Dentaire, Sage-femme, Sophrologie, Soins hospitaliers, Orthopédie, Podologie, Diététique, Radiologie, Orthophonie, Pédiatrie, Assurance Maladie, Pompes funèbres, Laboratoire, Gynécologie-obstétrique, Chiropractie, Psychomotricité, Ostéodensitométrie, Pneumologie, Vaccins, Sevrage tabagique, Contraception, Homéopathie, Acupunture], Unknown otherwise."
},
"adeli_number": {
"type": [
"string",
"null"
],
"description": "Adeli number (9-digit identifier) associated with the healthcare provider"
},
"rpps_number": {
"type": [
"string",
"null"
],
"description": "11 digits identifier, indicated after the term 'RPPS'"
},
"finess_number": {
"type": [
"string",
"null"
],
"description": "9 digits identifier, indicated after one of the terms in list ['finess', 'identifiant CPAM']"
},
"doctor_name": {
"type": [
"string",
"null"
],
"description": "Full name of the doctor"
},
"prescripteur_finess_number": {
"type": [
"string",
"null"
],
"description": "Finess number of the prescriber in the invoice (9 digits identifier, indicated after the term 'finess')"
},
"total_billed": {
"type": [
"number",
"null"
],
"description": "The total amount billed on the invoice"
},
"bill_paid": {
"type": "boolean",
"description": "True if the invoice has been paid, false otherwise (Look for terms like: 'acquittée', 'payée', 'quittance', 'réglée', 'certifie avoir reçu le règlement')"
},
"amount_paid": {
"type": [
"number",
"null"
],
"description": "The amount paid for the invoice"
},
"mandatory_coverage": {
"type": [
"number",
"null"
],
"description": "Amount covered by compulsory health insurance (indicated after terms like 'AMO', 'Rbmt RO', 'CAISSE', 'Noemie', etc.)"
},
"complementary_coverage": {
"type": [
"number",
"null"
],
"description": "Amount covered by complementary insurance (indicated after terms like 'AMC', 'RC', 'Mutuelle')"
},
"client_part": {
"type": [
"number",
"null"
],
"description": "Amount paid by client (indicated after terms like 'ASSURE', 'Part Client', 'Part Assuré')"
},
"remaining_payment": {
"type": [
"number",
"null"
],
"description": "The remaining balance to be paid by the beneficiary if the invoice is unpaid."
},
"insured_name": {
"type": [
"string",
"null"
],
"description": "Full name of the insured person (indicated after terms like 'Assure')"
},
"insured_dob": {
"type": [
"string",
"null"
],
"description": "Date of birth of the insured person (format: dd-mm-yyyy)"
},
"beneficiary_name": {
"type": [
"string",
"null"
],
"description": "Full name of the invoice beneficiary"
},
"beneficiary_dob": {
"type": [
"string",
"null"
],
"description": "Date of birth of the beneficiary (format: dd-mm-yyyy)"
},
"care_start_date": {
"type": [
"string",
"null"
],
"description": "Care start date (format: dd-mm-yyyy)"
},
"care_end_date": {
"type": [
"string",
"null"
],
"description": "Care end date (format: dd-mm-yyyy)"
},
"invoice_date": {
"type": [
"string",
"null"
],
"description": "Date of the invoice (format: dd-mm-yyyy)"
},
"security_number": {
"type": [
"string",
"null"
],
"description": "Social Security number (13 or 15 digit identifier, indicated after terms like 'Sécurité Social' ou 'N° INSEE' ou 'N° SS')"
},
"invoice_issuer": {
"type": [
"string",
"null"
],
"description": "Name or organization issuing the invoice or providing the service"
},
"currency": {
"type": [
"string",
"null"
],
"description": "Currency used (e.g., EUR, USD)"
},
"items": {
"type": "array",
"description": "List of items or services included in the invoice.",
"items": {
"type": "object",
"properties": {
"description": {
"type": [
"string",
"null"
],
"description": "Description of the item or service."
},
"quantity": {
"type": [
"number",
"null"
],
"description": "Quantity of the item or service."
},
"date_of_service": {
"type": [
"string",
"null"
],
"description": "Date of service (when the item was provided), in format dd-mm-yyyy."
},
"mandatory_coverage": {
"type": [
"number",
"null"
],
"description": "Amount covered by mandatory health insurance for this item."
},
"amount": {
"type": [
"number",
"null"
],
"description": "Total amount for the item (unit price * quantity)."
}
}
}
}
}
}

File diff suppressed because it is too large.