echo840
2023-05-23 18:24:16 +08:00
parent da758a9ca7
commit b388fba03e
470 changed files with 2523750 additions and 7307 deletions

Binary file not shown.

Binary file not shown.


@@ -0,0 +1,24 @@
from torch.utils.data import Dataset
import os


class HMEDataset(Dataset):
    """HME100K test split: handwritten mathematical expression images.

    Each line of the label file is expected to be "<image name><TAB><transcription>".
    """

    def __init__(
        self,
        image_dir_path="./data/HME100K/test_images",
        ann_path="./data/HME100K/test_labels.txt",
    ):
        # read the annotation file once and close the handle afterwards
        with open(ann_path, "r") as f:
            self.lines = f.readlines()
        self.image_dir_path = image_dir_path

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        # strip the trailing newline so gt_answers is the bare transcription
        image_id, answers = self.lines[idx].rstrip("\n").split("\t")[:2]
        img_path = os.path.join(self.image_dir_path, image_id)
        return {
            "image_path": img_path,
            "gt_answers": answers}


if __name__ == "__main__":
    data = HMEDataset("/home/zhangli/GPT4/MutimodelOCR/data/HME100K/test_images",
                      "/home/zhangli/GPT4/MutimodelOCR/data/HME100K/test_labels.txt")
    data = iter(data)
    batch = next(data)
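Since each sample is a plain dict of strings, the loader can be dropped straight into a torch DataLoader for evaluation. A minimal sketch, assuming the HME100K files sit under ./data/HME100K as the defaults expect; the module name hme_dataset for the file above is hypothetical:

from torch.utils.data import DataLoader

from hme_dataset import HMEDataset  # hypothetical module name for the file above

loader = DataLoader(HMEDataset(), batch_size=1, shuffle=False)
for batch in loader:
    # default_collate wraps each string in a length-1 list
    print(batch["image_path"][0], batch["gt_answers"][0])
    break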


@@ -0,0 +1,123 @@
from torch.utils.data import Dataset
import os
import json


class SROIEDataset(Dataset):
    """SROIE key information extraction, phrased as question answering.

    Each ground-truth .txt file is a JSON dict that may contain the keys
    'company', 'date', 'address' and 'total'; every present key becomes one
    (image, question, answer) sample.
    """

    def __init__(
        self,
        dir_path="./data/SROIE",
    ):
        self.image_list = []
        self.question_list = []
        self.answer_list = []
        for file_name in os.listdir(dir_path):
            if file_name.endswith(".txt") and '(' not in file_name:
                file_path = os.path.join(dir_path, file_name)
                img_path = file_path.replace('.txt', '.jpg')
                with open(file_path) as f:
                    info = json.load(f)
                if 'company' in info:
                    self.question_list.append("what is the name of the company that issued this invoice?")  # llava 0.12
                    # self.question_list.append("what is the company information in the image?")  # llava 0.08
                    self.answer_list.append(info['company'])
                    self.image_list.append(img_path)
                if 'date' in info:
                    self.question_list.append("when was this invoice issued?")
                    # self.question_list.append("what is the date information in the image?")
                    self.answer_list.append(info['date'])
                    self.image_list.append(img_path)
                if 'address' in info:
                    self.question_list.append("where was this invoice issued?")
                    # self.question_list.append("what is the address information in the image?")
                    self.answer_list.append(info['address'])
                    self.image_list.append(img_path)
                if 'total' in info:
                    self.question_list.append("what is the total amount of this invoice?")
                    # self.question_list.append("what is the total information in the image?")
                    self.answer_list.append(info['total'])
                    self.image_list.append(img_path)

    def __len__(self):
        return len(self.image_list)

    def __getitem__(self, idx):
        img_path = self.image_list[idx]
        question = self.question_list[idx]
        answers = self.answer_list[idx]
        return {
            "image_path": img_path,
            "question": question,
            "gt_answers": answers}


class FUNSDDataset(Dataset):
    """FUNSD form understanding: each linked question field becomes a QA pair."""

    def __init__(self, ann_dir_path="./data/FUNSD/testing_data/annotations"):
        questions = []
        answers = []
        images = []
        for file_name in os.listdir(ann_dir_path):
            file_path = os.path.join(ann_dir_path, file_name)
            with open(file_path, 'r') as f:
                json_data = json.load(f)['form']
            # drop entries whose "linking" field is missing or empty
            json_data = [d for d in json_data if "linking" in d and len(d["linking"]) > 0]
            question_list = [d for d in json_data if d.get('label') == 'question']
            answer_list = [d for d in json_data if d.get('label') == 'answer']
            for i in range(len(question_list)):
                link = question_list[i]['linking']
                gt_answer = ""
                # concatenate the text of every answer entry linked to this question
                for j in range(len(link)):
                    for k in range(len(answer_list)):
                        if answer_list[k]['id'] == link[j][1]:
                            if len(gt_answer) > 0:
                                gt_answer = gt_answer + ' ' + answer_list[k]['text']
                            else:
                                gt_answer = answer_list[k]['text']
                if len(gt_answer) > 0:
                    questions.append(f"what is \"{question_list[i]['text']}\" information in the image?")
                    answers.append(gt_answer)
                    images.append(file_path.replace('annotations', 'images').replace('.json', '.png'))
        self.questions = questions
        self.answers = answers
        self.images = images

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        img_path = self.images[idx]
        question = self.questions[idx]
        answers = self.answers[idx]
        return {
            "image_path": img_path,
            "question": question,
            "gt_answers": answers}


class POIEDataset(Dataset):
    """POIE key information extraction: one sample per entity listed in test.txt."""

    def __init__(
        self,
        dir_path="./data/POIE/test.txt",
    ):
        self.image_list = []
        self.question_list = []
        self.answer_list = []
        with open(dir_path, 'r') as f:
            lines = f.readlines()
        for line in lines:
            ann = json.loads(line)  # one JSON object per line
            for key, value in ann['entity_dict'].items():
                self.image_list.append(dir_path.replace("test.txt", ann['file_name']))
                self.question_list.append(key)
                self.answer_list.append(value)

    def __len__(self):
        return len(self.image_list)

    def __getitem__(self, idx):
        img_path = self.image_list[idx]
        question = self.question_list[idx]
        answers = self.answer_list[idx]
        return {
            "image_path": img_path,
            "question": question,
            "gt_answers": answers}


if __name__ == "__main__":
    data = POIEDataset("/home/zhangli/GPT4/MutimodelOCR/data/POIE/test.txt")
    data = iter(data)
    batch = next(data)
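All three loaders return the same {"image_path", "question", "gt_answers"} dict, so a single evaluation loop can drive SROIE, FUNSD and POIE alike. A minimal sketch; the predict callable stands in for whatever model is being benchmarked and is not part of this commit:

def exact_match_accuracy(dataset, predict):
    """Share of samples where predict(image_path, question) matches gt_answers exactly."""
    correct = 0
    for sample in dataset:
        pred = predict(sample["image_path"], sample["question"])
        correct += int(pred.strip().lower() == str(sample["gt_answers"]).strip().lower())
    return correct / max(len(dataset), 1)

# e.g. exact_match_accuracy(SROIEDataset("./data/SROIE"), my_model.predict)  # my_model is hypothetical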


@@ -0,0 +1,23 @@
import json
import os
import re


def has_chinese_characters(string):
    # any character from the basic CJK Unified Ideographs block counts as Chinese
    pattern = re.compile(r'[\u4e00-\u9fa5]')
    return bool(pattern.search(string))


if __name__ == "__main__":
    ann_file = "/home/zhangli/GPT4/MutimodelOCR/data/ESTVQA/annotations/train.json"
    img_file = "/home/zhangli/GPT4/MutimodelOCR/data/ESTVQA/images/train"
    cn_list = []
    en_list = []
    with open(ann_file, 'r') as f:
        data = json.load(f)
    # split the annotations by the language of each image's first question
    for i in range(len(data)):
        if has_chinese_characters(data[i]['annotation'][0]['question']):
            cn_list.append(data[i])
        else:
            en_list.append(data[i])
    with open('./cn_train.json', 'w', encoding='utf-8') as f:
        json.dump(cn_list, f, ensure_ascii=False)
    with open('./en_train.json', 'w', encoding='utf-8') as f:
        json.dump(en_list, f, ensure_ascii=False)
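The language test only looks for characters in U+4E00–U+9FA5, which is enough to route each record to the Chinese or English split. A quick illustration of the behaviour; the sample questions are invented, not taken from ESTVQA:

import re

pattern = re.compile(r'[\u4e00-\u9fa5]')
print(bool(pattern.search("图中的招牌写了什么?")))       # True  -> would go to cn_train.json
print(bool(pattern.search("What does the sign say?")))    # False -> would go to en_train.json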


@@ -0,0 +1,25 @@
import json
import os

if __name__ == "__main__":
    ann_dir_path = '/home/zhangli/GPT4/MutimodelOCR/data/FUNSD/training_data/annotations'
    questions = []
    answers = []
    for file_name in os.listdir(ann_dir_path):
        file_path = os.path.join(ann_dir_path, file_name)
        with open(file_path, 'r') as f:
            json_data = json.load(f)['form']
        # drop entries whose "linking" field is missing or empty
        json_data = [d for d in json_data if "linking" in d and len(d["linking"]) > 0]
        question_list = [d for d in json_data if d.get('label') == 'question']
        answer_list = [d for d in json_data if d.get('label') == 'answer']
        # keep only the first occurrence of each question text
        unique_question_list = [d for i, d in enumerate(question_list)
                                if d['text'] not in [x['text'] for x in question_list[:i]]]
        for i in range(len(unique_question_list)):
            link = unique_question_list[i]['linking']
            gt_answer = ""
            for j in range(len(link)):
                for k in range(len(answer_list)):
                    if answer_list[k]['id'] == link[j][1]:
                        # join multi-part answers with a space, as FUNSDDataset does
                        gt_answer = (gt_answer + ' ' + answer_list[k]['text']).strip()
            questions.append(unique_question_list[i]['text'])
            answers.append(gt_answer)
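As written, the script only accumulates the pairs in memory and exits. If the intent is to inspect them, a small extension at the end of the same __main__ block could write them out; the output path is only an example:

    # appended after the loop above, inside the same __main__ block
    with open('./funsd_train_qa.json', 'w', encoding='utf-8') as f:
        json.dump([{"question": q, "answer": a} for q, a in zip(questions, answers)],
                  f, ensure_ascii=False, indent=2)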


@@ -54,7 +54,6 @@ class ocrVQADataset(Dataset):
        self.question_list = []
        self.answer_list = []
        dataset = json.load(open(ann_path, "r"))
-       import pdb;pdb.set_trace()
        for idx, data in enumerate(dataset):
            questions = dataset[data]['questions']
            for index, question in enumerate(questions):
@@ -64,7 +63,7 @@ class ocrVQADataset(Dataset):
                self.answer_list.append(gt_answers)
                self.question_list.append(question)
    def __len__(self):
-       return len(self.data)
+       return len(self.image_list)
    def __getitem__(self, idx):
        question = self.question_list[idx]
@@ -85,13 +84,43 @@ class STVQADataset(Dataset):
        self.question_list = []
        self.answer_list = []
        data = json.load(open(ann_path, "r"))
-       for i in range(len(data)):
+       for i in range(len(data['data'])):
            image_path = image_dir_path+'/'+data['data'][i]['dataset']+'/'+data['data'][i]['file_name']
            self.image_list.append(image_path)
            self.answer_list.append(data['data'][i]['answers'])
            self.question_list.append(data['data'][i]['question'])
    def __len__(self):
-       return len(self.data)
+       return len(self.image_list)
    def __getitem__(self, idx):
        question = self.question_list[idx]
        answers = self.answer_list[idx]
        img_path = self.image_list[idx]
        return {
            "image_path": img_path,
            "question": question,
            "gt_answers": answers}


class ESTVQADataset(Dataset):
    def __init__(
        self,
        image_dir_path="./data/ESTVQA/images/train",
        ann_path="./data/ESTVQA/annotations/train.json",
    ):
        self.image_list = []
        self.question_list = []
        self.answer_list = []
        with open(ann_path, 'r') as f:
            data = json.load(f)
        for i in range(len(data)):
            image_path = os.path.join(image_dir_path, data[i]['image'])
            # every question/answer annotation of an image becomes one sample
            for j in range(len(data[i]['annotation'])):
                question = data[i]['annotation'][j]['question']
                answer = data[i]['annotation'][j]['answer']
                self.image_list.append(image_path)
                self.question_list.append(question)
                self.answer_list.append(answer)

    def __len__(self):
        return len(self.image_list)

    def __getitem__(self, idx):
        question = self.question_list[idx]