IAM ReCTS

2023-06-09 10:29:18 +08:00
parent 3c59897aa6
commit e22b12b169
185 changed files with 294244 additions and 22 deletions
--- a/datasets/__init__.py
+++ b/datasets/__init__.py
--- a/datasets/__pycache__/__init__.cpython-310.pyc
+++ b/datasets/__pycache__/__init__.cpython-310.pyc
--- a/datasets/__pycache__/__init__.cpython-38.pyc
+++ b/datasets/__pycache__/__init__.cpython-38.pyc
--- a/datasets/__pycache__/formula_dataset.cpython-310.pyc
+++ b/datasets/__pycache__/formula_dataset.cpython-310.pyc
--- a/datasets/__pycache__/formula_dataset.cpython-38.pyc
+++ b/datasets/__pycache__/formula_dataset.cpython-38.pyc
--- a/datasets/__pycache__/kie_dataset.cpython-310.pyc
+++ b/datasets/__pycache__/kie_dataset.cpython-310.pyc
--- a/datasets/__pycache__/kie_dataset.cpython-38.pyc
+++ b/datasets/__pycache__/kie_dataset.cpython-38.pyc
--- a/datasets/__pycache__/ocr_dataset.cpython-310.pyc
+++ b/datasets/__pycache__/ocr_dataset.cpython-310.pyc
--- a/datasets/__pycache__/ocr_dataset.cpython-38.pyc
+++ b/datasets/__pycache__/ocr_dataset.cpython-38.pyc
--- a/datasets/__pycache__/vqa_dataset.cpython-310.pyc
+++ b/datasets/__pycache__/vqa_dataset.cpython-310.pyc
--- a/datasets/__pycache__/vqa_dataset.cpython-38.pyc
+++ b/datasets/__pycache__/vqa_dataset.cpython-38.pyc
--- a/datasets/formula_dataset.py
+++ b/datasets/formula_dataset.py
--- a/datasets/kie_dataset.py
+++ b/datasets/kie_dataset.py
@@ -7,35 +7,36 @@ class SROIEDataset(Dataset):
        self,
        dir_path= "./data/SROIE",
    ):
+        dir_path = dir_path+'/ann'
        self.image_list = []
        self.question_list = []
        self.answer_list = []
        for file_name in os.listdir(dir_path):
            if file_name.endswith(".txt") and '(' not in file_name:
                file_path = os.path.join(dir_path, file_name)
-                img_path = file_path.replace('.txt', '.jpg')
+                img_path = file_path.replace('.txt', '.jpg').replace('ann','image')
                with open(file_path) as f:
                    content = f.read()
                    info = json.loads(content)
                    if 'company' in info.keys():
-                        self.question_list.append("what is the name of the company that issued this invoice?")#llava 0.12
+                        self.question_list.append("what is the name of the company that issued this receipt?")#llava 0.12
                        #self.question_list.append("what is the company information in the image?")#llava 0.08
                        self.answer_list.append(info['company'])
                        self.image_list.append(img_path)
                    if 'date' in info.keys():
-                        self.question_list.append("when was this invoice issued?")
+                        self.question_list.append("when was this receipt issued?")
                        #self.question_list.append("what is the date information in the image?")
                        self.answer_list.append(info['date'])
                        self.image_list.append(img_path)

                    if 'address' in info.keys():
-                        self.question_list.append("where was this invoice issued?")
+                        self.question_list.append("where was this receipt issued?")
                        #self.question_list.append("what is the address information in the image?")
                        self.answer_list.append(info['address'])
                        self.image_list.append(img_path)

                    if 'total' in info.keys():
-                        self.question_list.append("what is the total amount of this invoice?")
+                        self.question_list.append("what is the total amount of this receipt?")
                        #self.question_list.append("what is the total information in the image?")
                        self.answer_list.append(info['total'])
                        self.image_list.append(img_path)
--- a/datasets/ocr_dataset.py
+++ b/datasets/ocr_dataset.py
@@ -1,5 +1,11 @@
 from torch.utils.data import Dataset
+import xml.etree.ElementTree as ET
 import os
+import re
+def remove_special_chars(s):
+    # Drop every character that is not an ASCII letter, a digit, or whitespace.
+    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", s)
+    return cleaned
 class ocrDataset(Dataset):
    def __init__(
        self,
@@ -19,4 +25,64 @@ class ocrDataset(Dataset):
        answers = self.lines[idx].split()[1]
        return {
            "image_path": img_path,
-            "gt_answers": answers}
+            "gt_answers": answers}
+class IAMDataset(Dataset):
+    # Word-level IAM samples: (word image path, cleaned transcription) pairs.
+    def __init__(self, image_dir_path = './data/IAM') -> None:
+        xml_dir = image_dir_path + '/xml'
+        self.images, self.answers = [], []
+        for filename in os.listdir(xml_dir):
+            if not filename.endswith('.xml'):
+                continue
+            # Parse one annotation file and walk every <word> element in it.
+            root = ET.parse(os.path.join(xml_dir, filename)).getroot()
+            # Image folder layout: <root>/<name before '-'>/<name before '.'>/
+            prefix = filename.split('-')[0]
+            stem = filename.split('.')[0]
+            for word in root.iter('word'):
+                img_id = word.get('id')
+                img_path = image_dir_path+'/'+prefix+'/'+stem+'/'+img_id+'.png'
+                label = remove_special_chars(word.get('text'))
+                if not label:
+                    continue  # nothing left once special characters are removed
+                self.images.append(img_path)
+                self.answers.append(label)
+
+    def __len__(self):
+        return len(self.images)
+
+    def __getitem__(self, idx):
+        # One sample: the word image path and its ground-truth text.
+        sample = {"image_path": self.images[idx], "gt_answers": self.answers[idx]}
+        return sample
+class ReCTSDataset(Dataset):
+    # ReCTS crops: each line of test_label.txt is "<crop file name> <label>".
+    def __init__(
+        self,
+        dir_path= "./data/ReCTS",
+    ):
+        self.image_dir_path = os.path.join(dir_path, 'crops')
+        file_path = os.path.join(dir_path, 'test_label.txt')
+        # Context manager so the label file handle is released after reading.
+        with open(file_path, "r") as file:
+            self.lines = file.readlines()
+    def __len__(self):
+        return len(self.lines)
+    def __getitem__(self, idx):
+        # Split once; field 0 is the crop name, field 1 the ground-truth text.
+        fields = self.lines[idx].split()
+        img_path = os.path.join(self.image_dir_path, fields[0])
+        return {"image_path": img_path, "gt_answers": fields[1]}
+if __name__ == "__main__":
+    '''data = IAMDataset('/home/zhangli/GPT4/MutimodelOCR/data/IAM')
+    print(len(data))
+    data = iter(data)
+    batch = next(data)
+    import pdb;pdb.set_trace()'''
+    # Smoke test: load ReCTS from a machine-specific path, print size and first item.
+    dataset = ReCTSDataset('/home/zhangli/GPT4/MutimodelOCR/data/ReCTS')
+    print(len(dataset))
+    first = next(iter(dataset))
+    print(first)
+
+
--- a/datasets/process/process_ESTVQA.py
+++ b/datasets/process/process_ESTVQA.py
@@ -5,8 +5,8 @@ def has_chinese_characters(string):
    pattern = re.compile(r'[\u4e00-\u9fa5]')
    return bool(pattern.search(string))
 if __name__ == "__main__":
-    ann_file = "/home/zhangli/GPT4/MutimodelOCR/data/ESTVQA/annotations/train.json"
-    img_file = "/home/zhangli/GPT4/MutimodelOCR/data/ESTVQA/images/train"
+    ann_file = "/home/zhangli/OCRData/data/TextVQA/ESTVQA/annotations/train.json"
+    #img_file = "/home/zhangli/GPT4/MutimodelOCR/data/ESTVQA/images/train"
    cn_list = []
    en_list= []
    with open(ann_file,'r') as f:
--- a/datasets/process/process_FUNSD.py
+++ b/datasets/process/process_FUNSD.py
--- a/datasets/process/process_ReCTS.py
+++ b/datasets/process/process_ReCTS.py
@@ -0,0 +1,23 @@
+import re
+import os
+def has_chinese_characters(string):
+    # True when at least one character lies in the range U+4E00..U+9FA5.
+    return re.search(r'[\u4e00-\u9fa5]', string) is not None
+def is_all_chinese(text):
+    """
+    Return True when the string consists only of characters in U+4E00..U+9FA5.
+    """
+    matched = re.match(r'^[\u4e00-\u9fa5]+$', text)
+    return matched is not None
+if __name__ =='__main__':
+    file_path = "/home/zhangli/GPT4/MutimodelOCR/data/ReCTS/annotation.txt"
+    out_path = "/home/zhangli/GPT4/MutimodelOCR/data/ReCTS/ann.txt"
+    # `with` closes both files even if a line raises part-way through.
+    with open(file_path, 'r') as file, open(out_path, 'w') as out:
+        for line in file:
+            fields = line.strip().split()
+            if len(fields) < 2:  # blank or label-less line: nothing to filter
+                continue
+            path = os.path.join("/home/zhangli/GPT4/MutimodelOCR/data/ReCTS/crops", fields[0])
+            if is_all_chinese(fields[1]) and os.path.exists(path):  # all-Chinese label, crop exists
+                out.write(line.strip() + '\n')
--- a/datasets/vqa_dataset.py
+++ b/datasets/vqa_dataset.py