feat: add function to convert training data into conversation format

2025-09-03 15:16:44 +00:00
parent 2fc34e192a
commit 4aefd9c10a
1 changed files with 159 additions and 0 deletions
--- a/easydistill/mmkd/convert_conversation_json.py
+++ b/easydistill/mmkd/convert_conversation_json.py
@@ -0,0 +1,159 @@
+import json
+import uuid
+from typing import List, Dict
+import argparse
+import shutil
+
+# ---------------------------
+# Helper functions
+# ---------------------------
+
+def load_json_data(filepath: str):
+    """Load JSON file from disk."""
+    with open(filepath, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+def get_img_path(data: List[dict]):
+    """Extract image paths from a single record."""
+    return [item["image"] for item in data[1].get("content", []) if item.get("type") == "image"]
+
+# def get_conversations(data: List[dict]):
+#     """Extract conversations in desired format."""
+#     conversation_data = []
+#     for item in data:
+#         if item.get("role") == "system":
+#             conversation_data.append({"from": "system", "value": item.get("content")})
+#         elif item.get("role") == "user":
+#             texts = [x["text"] for x in item.get("content", []) if x.get("type") == "text"][0]
+#             conversation_data.append({"from": "human", "value": texts})
+#         elif item.get("role") == "assistant" or item.get("role") == "assistant_gt":
+#             #texts = [x["text"] for x in item.get("content", []) if x.get("type") == "text"][0]
+#             #conversation_data.append({"from": "gpt", "value": texts})
+#             conversation_data.append({"from": "gpt", "value": item.get("content")})
+#     return conversation_data
+
+def get_conversations_v2(data: List[dict]) -> List[Dict[str, str]]:
+    """Extract conversations in desired format, handling multiple text items."""
+    conversation_data = []
+    
+    for item in data: 
+        role = item.get("role")
+        
+        if role == "system":
+            conversation_data.append({"from": "system", "value": item.get("content")})
+            
+        elif role == "user":
+            texts = [x["text"] for x in item.get("content", []) if x.get("type") == "text"]
+
+            if texts: 
+                conversation_data.append({"from": "human", "value": texts[0]})
+                
+        elif role in ["assistant", "assistant_gt"]:
+            content = item.get("content")
+
+            if isinstance(content, list): # list of dicts
+                texts = [x["text"] for x in content if x.get("type") == "text"]
+                if texts:
+                    conversation_data.append({"from": "gpt", "value": texts[0]})
+            
+            elif isinstance(content, str): # single string
+                conversation_data.append({"from": "gpt", "value": content})
+            
+            else: # raw content
+                conversation_data.append({"from": "gpt", "value": str(content)})
+    return conversation_data
+
+def convert(images: List[str] = [], conversation: List[Dict[str,str]] = []):
+    """Convert raw data into docai_mgp_facture_data instance."""
+    new_data = docai_mgp_facture_data()
+    new_data.id = str(uuid.uuid4())
+    new_data.images["images"] = images
+    new_data.conversations["conversations"] = conversation
+    return new_data
+
+# ---------------------------
+# Data class
+# ---------------------------
+
+class docai_mgp_facture_data:
+    id: str
+    images: Dict[str, List[str]]
+    conversations: Dict[str, List[Dict[str,str]]]
+
+    def __init__(self):
+        self.id = ""
+        self.images = {"images": []}
+        self.conversations = {"conversations": [{"from": "", "value": ""}]}
+
+    def display_data(self):
+        print("Current data in instance:")
+        print(f"ID: {self.id}")
+        print("Images:")
+        for img in self.images.get("images", []):
+            print(f" - {img}")
+        print("Conversations:")
+        for conv in self.conversations.get("conversations", []):
+            print(f" - from: {conv.get('from')}, value: {conv.get('value')}")
+
+    def write_to_json(self, filename: str):
+        """Write the current instance data to a JSON file (overwrite)."""
+        data_dict = {
+            "id": self.id,
+            "images": self.images["images"],
+            "conversations": self.conversations["conversations"]
+        }
+        with open(filename, "w", encoding="utf-8") as f:
+            json.dump(data_dict, f, ensure_ascii=False, indent=4)
+        print(f"Data written to {filename}")
+
+
+def main() -> None:
+    '''
+    Input: one or more JSON files path
+    Output: one JSON file under conversation format
+    
+    Ex: python3 ../convert_conversation_json.py \
+                --source_path data1.json data2.json ... \
+                --destination_path dest_path.json
+    '''
+    parser = argparse.ArgumentParser(description="Convert one or more JSON files to conversation-form JSON.")
+    parser.add_argument(
+        "--source_path",
+        type=str,
+        nargs='+',  # allow multiple files
+        required=True,
+        help="Path(s) to the source JSON file."
+    )
+    parser.add_argument(
+        "--destination_path",
+        type=str,
+        required=True,
+        help="Path to the destination JSON file."
+    )
+    args = parser.parse_args()
+
+    all_data = []
+
+    for source_path in args.source_path:  # match the argument name
+        source_data = load_json_data(source_path)
+        for record_data in source_data:
+            images = get_img_path(record_data)
+            conversations = get_conversations_v2(record_data)
+            record = convert(images=images, conversation=conversations)
+            all_data.append({
+                "id": record.id,
+                "images": record.images["images"],
+                "conversations": record.conversations["conversations"]
+            })
+
+    with open(args.destination_path, "w", encoding="utf-8") as f:
+        json.dump(all_data, f, ensure_ascii=False, indent=4)
+
+    print(f"✅ All data from {len(args.source_path)} file(s) saved to {args.destination_path}")
+
+# ---------------------------
+# Main script
+# ---------------------------
+
+if __name__ == "__main__":
+    main()