import argparse
import glob
import json
import os
import random
import re
from pathlib import Path


def load_json(filepath):
    """
    Loads a JSON file.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Error: The file was not found at {filepath}")
        return None
    except json.JSONDecodeError as e:
        print(f"Error: The file at {filepath} is not a valid JSON file. Details: {e}")
        return None

def read_text_file(filepath):
    """
    Loads a prompt from a text file.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return f.read().strip()
    except FileNotFoundError:
        print(f"Error: The file was not found at {filepath}")
        return None

def format_items_list(items, language):
    """
    Formats a list of item dictionaries (services) into a human-readable string.
    """
    if not items:
        return ""
    formatted_lines = []
    for item in items:
        if not isinstance(item, dict):
            continue
        parts = []
        desc = item.get("description")
        if desc is not None:
            parts.append(f"{desc}")
        qty = item.get("quantity")
        if qty is not None:
            qty_str = "Quantity" if language == "english" else "Quantité"
            parts.append(f"{qty_str}: {qty}")
        date = item.get("date_of_service")
        if date is not None:
            date_str = "Date"  # same label in English and French
            parts.append(f"{date_str}: {date}")
        mandatory = item.get("mandatory_coverage")
        if mandatory is not None:
            amo_str = (
                "Mandatory Coverage"
                if language == "english"
                else "Couverture obligatoire"
            )
            parts.append(f"{amo_str}: {mandatory}")
        amount = item.get("amount")
        if amount is not None:
            amount_str = "Amount" if language == "english" else "Montant"
            parts.append(f"{amount_str}: {amount}")
        formatted_lines.append("- " + ", ".join(parts))
    return "\n".join(formatted_lines)
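
# Illustrative example (item keys follow the label schema handled above):
#   format_items_list(
#       [{"description": "Consultation", "quantity": 1, "amount": "25.00"}],
#       "english",
#   )
# returns the single line:
#   "- Consultation, Quantity: 1, Amount: 25.00"
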
def get_conversational_answer(field, label_data, answer_bank, language):
    """
    Generates a complete conversational answer by selecting a template and filling it
    with the appropriate value from the label data.
    """
    if not isinstance(label_data, dict):
        return ""
    value = label_data.get(field)
    field_templates = answer_bank.get(field)
    if not field_templates:
        return str(value) if value is not None else ""
    if value is None:
        return random.choice(field_templates.get("null", {}).get(language, [""]))
    if field == "items":
        template = random.choice(field_templates[language])
        formatted_list_string = format_items_list(value, language)
        return template.format(value=formatted_list_string)
    if isinstance(value, bool):
        bool_key = str(value).lower()
        if bool_key in field_templates[language]:
            return random.choice(field_templates[language][bool_key])
        return str(value)
    if isinstance(field_templates[language], list):
        template = random.choice(field_templates[language])
        return template.format(value=value)
    return str(value) if value is not None else ""
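
# The answer bank is assumed to have roughly this shape (inferred from the
# lookups above; field names and templates are illustrative):
#   {
#       "doctor_name": {
#           "english": ["The doctor is {value}."],
#           "french": ["Le médecin est {value}."],
#           "null": {"english": ["It is not mentioned."], "french": ["Ce n'est pas mentionné."]}
#       },
#       "bill_paid": {
#           "english": {"true": ["Yes, the bill has been paid."], "false": ["No, it has not."]},
#           "french": {"true": ["Oui, la facture a été payée."], "false": ["Non."]}
#       }
#   }
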
# --- Conversation Generation for Label Data ---
def generate_vqa_conversations(
    labels_path,
    image_root,
    system_prompt_path,
    questions_path,
    answers_path,
    output_path,
    ratio=0.4,
):
    """
    Generates multiple conversational VQA pairs for each field in a label file,
    handling multi-page documents.
    """
    all_data_entries = load_json(labels_path)
    system_prompt = read_text_file(system_prompt_path)
    question_bank = load_json(questions_path)
    answer_bank = load_json(answers_path)
    if (
        not all_data_entries
        or not system_prompt
        or not question_bank
        or not answer_bank
    ):
        print("Could not load one or more necessary files. Exiting.")
        return

    final_conversations = []
    # Process each entry in the main label file
    for entry in all_data_entries:
        label_data = entry.get("label")
        image_filename_prefix = entry.get("image")
        # Skip unlabeled entries, since labels are needed to generate Q&A pairs
        if not label_data or not image_filename_prefix:
            continue
        # Ask about every field in the question bank (the commented-out variant
        # would restrict questions to fields present in the label data):
        # all_fields = [f for f in label_data if isinstance(f, str) and f in question_bank]
        all_fields = list(question_bank.keys())
        # Determine how many questions to ask based on the available fields
        num_to_sample = max(1, int(len(all_fields) * ratio))
        # Randomly select fields to ask questions about
        fields_to_ask = random.sample(all_fields, num_to_sample)
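        # question_bank is assumed to map each field name to per-language lists
        # of question templates (structure inferred from the indexing below), e.g.:
        #   {"doctor_name": {"english": ["Who is the doctor?"], "french": ["Qui est le médecin ?"]}}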
        # Find all image files in image_root that start with the prefix.
        # This handles cases like 'doc-1.jpg', 'doc-2.jpg', 'doc_scale.jpg', etc.
        prefix_stem = Path(image_filename_prefix).stem
        search_pattern = os.path.join(image_root, f"{prefix_stem}*")
        found_image_paths = sorted(glob.glob(search_pattern))
        if not found_image_paths:
            print(
                f"Warning: No images found for prefix '{prefix_stem}' in '{image_root}'. Skipping."
            )
            continue
        # Create a list of image dictionaries for the user message
        image_content_list = [
            {"type": "image", "image": path} for path in found_image_paths
        ]

        # --- Create a new conversation for EACH sampled field ---
        for field in fields_to_ask:
            if not isinstance(field, str):
                continue
            if field not in question_bank:
                continue
            language = random.choice(["english", "french"])
            # Get the question from the question bank
            question_text = random.choice(question_bank[field][language])
            # Get the conversational answer from the answer bank
            answer_text = get_conversational_answer(
                field, label_data, answer_bank, language
            )
            # --- Assemble the conversation in the desired format ---
            system_message = {"role": "system", "content": system_prompt}
            user_message = {
                "role": "user",
                # Content is the list of image dicts followed by the text dict;
                # one <image> placeholder is prepended per page of the document
                "content": image_content_list
                + [{"type": "text", "text": "<image>" * len(found_image_paths) + question_text}],
            }
            assistant_message = {"role": "assistant", "content": answer_text}
            conversation = [system_message, user_message, assistant_message]
            final_conversations.append(conversation)

    # Save the final list of conversations to the output file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_conversations, f, indent=4, ensure_ascii=False)
    print(f"Success! Generated {len(final_conversations)} conversational VQA entries.")
    print(f"Formatted data saved to: {output_path}")
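
# A single generated conversation has roughly this shape (values illustrative):
#   [
#       {"role": "system", "content": "<system prompt text>"},
#       {"role": "user", "content": [
#           {"type": "image", "image": "/path/to/doc_1.jpg"},
#           {"type": "text", "text": "<image>Who is the doctor?"}
#       ]},
#       {"role": "assistant", "content": "The doctor is Dr. Dupont."}
#   ]
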

# --- Conversation Generation for Multi-Turn Dialogues ---
def generate_multiturn_conversations(
    labels_path,
    image_root,
    system_prompt_path,
    questions_path,
    answers_path,
    output_path,
):
"""
Generates multi - turn conversational VQA pairs based on predefined field groups .
"""
all_data_entries = load_json ( labels_path )
system_prompt = read_text_file ( system_prompt_path )
question_bank = load_json ( questions_path )
answer_bank = load_json ( answers_path )
if (
not all_data_entries
or not system_prompt
or not question_bank
or not answer_bank
) :
print ( " Could not load one or more necessary files. Exiting. " )
return
# --- MODIFICATION: Define the field groupings for multi-turn conversations ---
CONVERSATION_GROUPS = {
" doctor_name " : [ " profession " , " finess_number " , " rpps_number " , " adeli_number " ] ,
" beneficiary_name " : [ " beneficiary_dob " , " security_number " ] ,
" bill_paid " : [ " mandatory_coverage " , " complementary_coverage " , " client_part " , " amount_paid " ] ,
}
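    # Each group yields one conversation: the system prompt, a first user turn
    # carrying the images, the assistant's answer, then text-only follow-up
    # question/answer turns for each related field present in the label.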
    final_conversations = []
    for entry in all_data_entries:
        label_data = entry.get("label")
        image_filename_prefix = entry.get("image")
        if not label_data or not image_filename_prefix:
            continue
        # Find all image files associated with this entry
        prefix_stem = Path(image_filename_prefix).stem
        search_pattern = os.path.join(image_root, f"{prefix_stem}*")
        found_image_paths = sorted(glob.glob(search_pattern))
        if not found_image_paths:
            continue
        image_content_list = [
            {"type": "image", "image": path} for path in found_image_paths
        ]
        # --- Create a multi-turn conversation for each group ---
        for main_field, related_fields in CONVERSATION_GROUPS.items():
            # Start a conversation only if the main field exists in both the
            # label and the question bank
            if main_field not in label_data or main_field not in question_bank:
                continue
            conversation = []
            language = random.choice(["english", "french"])
            # 1. Add the system prompt
            conversation.append({"role": "system", "content": system_prompt})
            # 2. First user turn (with images)
            first_question = random.choice(question_bank[main_field][language])
            conversation.append({
                "role": "user",
                "content": image_content_list
                + [{"type": "text", "text": "<image>" * len(found_image_paths) + first_question}],
            })
            # 3. First assistant turn
            first_answer = get_conversational_answer(
                main_field, label_data, answer_bank, language
            )
            conversation.append({"role": "assistant", "content": first_answer})
            # 4. Follow-up turns for related fields
            for follow_up_field in related_fields:
                if follow_up_field in label_data and follow_up_field in question_bank:
                    # Follow-up user turn (text only)
                    follow_up_question = random.choice(
                        question_bank[follow_up_field][language]
                    )
                    conversation.append({
                        "role": "user",
                        "content": [{"type": "text", "text": follow_up_question}],
                    })
                    # Follow-up assistant turn
                    follow_up_answer = get_conversational_answer(
                        follow_up_field, label_data, answer_bank, language
                    )
                    conversation.append(
                        {"role": "assistant", "content": follow_up_answer}
                    )
            final_conversations.append(conversation)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_conversations, f, indent=4, ensure_ascii=False)
    print(f"Success! Generated {len(final_conversations)} multi-turn VQA conversations.")
    print(f"Formatted data saved to: {output_path}")

# --- Conversation Generation for Images Only ---
def generate_vq_question(
    image_root, system_prompt_path, questions_path, output_path, ratio=0.4
):
2025-08-07 08:38:26 +00:00
"""
Generates conversational VQA pairs for each document based on images only ( no labels ) .
Each conversation contains a system and user message for each question in the question bank .
"""
system_prompt = read_text_file ( system_prompt_path )
question_bank = load_json ( questions_path )
if not system_prompt or not question_bank :
print ( " Could not load one or more necessary files. Exiting. " )
return
# Find all images and group by prefix
2025-08-08 15:07:37 +00:00
all_image_paths = sorted (
glob . glob ( os . path . join ( image_root , " *.jpg " ) )
+ glob . glob ( os . path . join ( image_root , " *.png " ) )
+ glob . glob ( os . path . join ( image_root , " *.jpeg " ) )
)
2025-08-07 08:38:26 +00:00
    prefix_to_images = {}
    for path in all_image_paths:
        if not os.path.isfile(path):
            continue
        stem = Path(path).stem
        # Strip page/scale suffixes like _1, _2_scale, etc., so all pages of a
        # document share one prefix
        prefix = re.sub(r"(_\d+(_scale)?)$", "", stem)
        prefix_to_images.setdefault(prefix, []).append(path)
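    # e.g. 'facture_1.jpg' and 'facture_2_scale.jpg' (filenames illustrative)
    # both group under the prefix 'facture'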

    # Get the list of all possible fields from the question bank
    all_fields = list(question_bank.keys())
    # Determine how many questions to ask based on the available fields
    num_to_sample = max(1, int(len(all_fields) * ratio))
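    # e.g. a bank of 20 fields with ratio=0.4 yields 8 questions per document;
    # ratio is assumed to be <= 1, since random.sample cannot draw more items
    # than the population holds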
    final_conversations = []
    for prefix, image_paths in prefix_to_images.items():
        image_content_list = [
            {"type": "image", "image": path} for path in sorted(image_paths)
        ]
        # Randomly select fields to ask questions about
        fields_to_ask = random.sample(all_fields, num_to_sample)
        for field in fields_to_ask:
            language = random.choice(["english", "french"])
            question_text = random.choice(question_bank[field][language])
            system_message = {"role": "system", "content": system_prompt}
            user_message = {
                "role": "user",
                "content": image_content_list
                + [{"type": "text", "text": "<image>" * len(image_paths) + question_text}],
            }
            conversation = [system_message, user_message]
            final_conversations.append(conversation)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_conversations, f, indent=4, ensure_ascii=False)
    print(
        f"Success! Generated {len(final_conversations)} image-only conversational VQA entries."
    )
    print(f"Formatted data saved to: {output_path}")

# --- Conversation Generation for Multi-Turn Questions (No Labels) ---
def generate_multiturn_vq_question(
    image_root, system_prompt_path, questions_path, output_path
):
"""
Generates multi - turn , question - only conversational prompts for each document .
"""
system_prompt = read_text_file ( system_prompt_path )
question_bank = load_json ( questions_path )
if not system_prompt or not question_bank :
print ( " Could not load one or more necessary files. Exiting. " )
return
# --- MODIFICATION: Define the same field groupings ---
CONVERSATION_GROUPS = {
" doctor_name " : [ " profession " , " finess_number " , " rpps_number " , " adeli_number " ] ,
" beneficiary_name " : [ " beneficiary_dob " , " security_number " ] ,
" bill_paid " : [ " mandatory_coverage " , " complementary_coverage " , " client_part " , " amount_paid " ] ,
}
    # Find all images and group them by document prefix
    all_image_paths = sorted(
        glob.glob(os.path.join(image_root, "*.jpg"))
        + glob.glob(os.path.join(image_root, "*.png"))
        + glob.glob(os.path.join(image_root, "*.jpeg"))
    )
    prefix_to_images = {}
    for path in all_image_paths:
        if not os.path.isfile(path):
            continue
        stem = Path(path).stem
        # Strip page/scale suffixes like _1, _2_scale, etc.
        prefix = re.sub(r"(_\d+(_scale)?)$", "", stem)
        prefix_to_images.setdefault(prefix, []).append(path)
    final_conversations = []
    for prefix, image_paths in prefix_to_images.items():
        image_content_list = [
            {"type": "image", "image": path} for path in sorted(image_paths)
        ]
        # --- Create a multi-turn conversation for each group ---
        for main_field, related_fields in CONVERSATION_GROUPS.items():
            # Skip groups whose opening field has no questions in the bank
            if main_field not in question_bank:
                continue
            conversation = []
            language = random.choice(["english", "french"])
            # 1. Add the system prompt
            conversation.append({"role": "system", "content": system_prompt})
            # 2. First user turn (with images)
            first_question = random.choice(question_bank[main_field][language])
            conversation.append({
                "role": "user",
                "content": image_content_list
                + [{"type": "text", "text": "<image>" * len(image_paths) + first_question}],
            })
            # 3. Follow-up user turns (text only)
            for follow_up_field in related_fields:
                if follow_up_field in question_bank:
                    follow_up_question = random.choice(
                        question_bank[follow_up_field][language]
                    )
                    conversation.append({
                        "role": "user",
                        "content": [{"type": "text", "text": follow_up_question}],
                    })
            final_conversations.append(conversation)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_conversations, f, indent=4, ensure_ascii=False)
    print(f"Success! Generated {len(final_conversations)} multi-turn VQA question sets.")
    print(f"Formatted data saved to: {output_path}")

# --- Main Execution Block ---
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Generate VQA conversations from label data."
    )
    parser.add_argument(
        "--image_root",
        type=str,
        default="/home/nguyendc/docai_dataset/factures/distill_data/docai_mgp_facture_v2_0",
        help="Root directory containing images.",
    )
    parser.add_argument(
        "--labels",
        type=str,
        default="/home/nguyendc/docai_dataset/factures/distill_data/docai_mgp_facture_v2_0/label_data.json",
        help="Path to the label data JSON file.",
    )
    parser.add_argument(
        "--system_prompt",
        type=str,
        default="/home/nguyendc/phong-dev/distillation/easydistill/mmkd/dev-vqa/qa_bank/unstructured_prompt.txt",
        help="Path to the system prompt text file.",
    )
    parser.add_argument(
        "--questions",
        type=str,
        default="/home/nguyendc/phong-dev/distill/prompt/question_bank.json",
        help="Path to the question bank JSON file.",
    )
    parser.add_argument(
        "--answers",
        type=str,
        default="/home/nguyendc/phong-dev/distill/prompt/answer_bank.json",
        help="Path to the answer bank JSON file.",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="/home/nguyendc/phong-dev/distillation/data/vqa_label.json",
        help="Path to save the output VQA conversations JSON file.",
    )
    parser.add_argument(
        "--ratio",
        type=float,
        default=0.4,
        help="Ratio of fields to sample for questions (default: 0.4).",
    )
    args = parser.parse_args()

    # Single-turn, field-by-field conversations WITH labels
    generate_vqa_conversations(
        args.labels,
        args.image_root,
        args.system_prompt,
        args.questions,
        args.answers,
        args.output,
        args.ratio,
    )
    # Use this for multi-turn conversations WITH labels based on field groups:
    # generate_multiturn_conversations(args.labels, args.image_root, args.system_prompt, args.questions, args.answers, args.output)
    # Use this for generating question-only prompts for unlabeled images:
    # generate_vq_question(args.image_root, args.system_prompt, args.questions, args.output, args.ratio)
    # Use this for multi-turn question-only prompts for unlabeled images:
    # generate_multiturn_vq_question(args.image_root, args.system_prompt, args.questions, args.output)
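
    # Example invocation (paths and script name illustrative; substitute your
    # own dataset layout):
    #   python generate_vqa_conversations.py \
    #       --image_root ./images \
    #       --labels ./label_data.json \
    #       --system_prompt ./system_prompt.txt \
    #       --questions ./question_bank.json \
    #       --answers ./answer_bank.json \
    #       --output ./vqa_label.json \
    #       --ratio 0.4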