IAM ReCTS

This commit is contained in:
echo840
2023-06-09 10:29:18 +08:00
parent 3c59897aa6
commit e22b12b169
185 changed files with 294244 additions and 22 deletions
Regular → Executable
View File
View File
Binary file not shown.
View File
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
View File
Binary file not shown.
Regular → Executable
View File
Regular → Executable
+6 -5
View File
@@ -7,35 +7,36 @@ class SROIEDataset(Dataset):
self,
dir_path= "./data/SROIE",
):
dir_path = dir_path+'/ann'
self.image_list = []
self.question_list = []
self.answer_list = []
for file_name in os.listdir(dir_path):
if file_name.endswith(".txt") and '(' not in file_name:
file_path = os.path.join(dir_path, file_name)
img_path = file_path.replace('.txt', '.jpg')
img_path = file_path.replace('.txt', '.jpg').replace('ann','image')
with open(file_path) as f:
content = f.read()
info = json.loads(content)
if 'company' in info.keys():
self.question_list.append("what is the name of the company that issued this invoice?")#llava 0.12
self.question_list.append("what is the name of the company that issued this receipt?")#llava 0.12
#self.question_list.append("what is the company information in the image?")#llava 0.08
self.answer_list.append(info['company'])
self.image_list.append(img_path)
if 'date' in info.keys():
self.question_list.append("when was this invoice issued?")
self.question_list.append("when was this receipt issued?")
#self.question_list.append("what is the date information in the image?")
self.answer_list.append(info['date'])
self.image_list.append(img_path)
if 'address' in info.keys():
self.question_list.append("where was this invoice issued?")
self.question_list.append("where was this receipt issued?")
#self.question_list.append("what is the address information in the image?")
self.answer_list.append(info['address'])
self.image_list.append(img_path)
if 'total' in info.keys():
self.question_list.append("what is the total amount of this invoice?")
self.question_list.append("what is the total amount of this receipt?")
#self.question_list.append("what is the total information in the image?")
self.answer_list.append(info['total'])
self.image_list.append(img_path)
Regular → Executable
+67 -1
View File
@@ -1,5 +1,11 @@
from torch.utils.data import Dataset
import xml.etree.ElementTree as ET
import os
import re
def remove_special_chars(s):
    """Return ``s`` stripped of everything except ASCII letters, digits and whitespace."""
    # The negated character class selects every character to drop.
    return re.sub(r"[^a-zA-Z0-9\s]", "", s)
class ocrDataset(Dataset):
def __init__(
self,
@@ -19,4 +25,64 @@ class ocrDataset(Dataset):
answers = self.lines[idx].split()[1]
return {
"image_path": img_path,
"gt_answers": answers}
"gt_answers": answers}
class IAMDataset(Dataset):
    """Word-level samples built from the XML annotations of an IAM data directory.

    Every ``<word>`` element found in ``<image_dir_path>/xml/*.xml`` yields one
    sample: the path of the word image and its ``text`` attribute cleaned with
    ``remove_special_chars``. Words whose cleaned text is empty are dropped.
    """

    def __init__(self, image_dir_path='./data/IAM') -> None:
        """Scan the annotation directory and collect (image path, text) pairs.

        Args:
            image_dir_path: Root directory that contains the ``xml`` annotation
                folder and the word images.
        """
        ann_path = image_dir_path + '/xml'
        self.images = []
        self.answers = []
        # os.listdir() gives entries in arbitrary order; sorting keeps sample
        # indices reproducible between runs and machines.
        for filename in sorted(os.listdir(ann_path)):
            if not filename.endswith('.xml'):
                continue
            # Read the XML annotation file.
            root = ET.parse(os.path.join(ann_path, filename)).getroot()
            # Images are laid out as
            # <root>/<name part before the first '-'>/<name without extension>/<word id>.png
            word_dir = (
                image_dir_path + '/' + filename.split('-')[0]
                + '/' + filename.split('.')[0]
            )
            for word in root.iter('word'):
                text = word.get('text')
                img_id = word.get('id')
                # A <word> missing either attribute cannot form a sample;
                # skip it instead of failing on None further down.
                if text is None or img_id is None:
                    continue
                text = remove_special_chars(text)
                if len(text) > 0:
                    self.images.append(word_dir + '/' + img_id + '.png')
                    self.answers.append(text)

    def __len__(self):
        """Return the number of collected word samples."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return the image path and ground-truth text of sample ``idx``."""
        return {
            "image_path": self.images[idx],
            "gt_answers": self.answers[idx]}
class ReCTSDataset(Dataset):
    """Samples read from ``<dir_path>/test_label.txt``.

    Each usable line of the label file holds whitespace-separated fields: the
    first is an image file name (resolved inside ``<dir_path>/crops``) and the
    second is the ground-truth text.
    """

    def __init__(
        self,
        dir_path="./data/ReCTS",
    ):
        """Load the label lines.

        Args:
            dir_path: Directory containing ``test_label.txt`` and ``crops``.
        """
        self.image_dir_path = os.path.join(dir_path, 'crops')
        file_path = os.path.join(dir_path, 'test_label.txt')
        # Context manager closes the handle; the explicit encoding avoids
        # depending on the locale default for non-ASCII labels.
        with open(file_path, "r", encoding="utf-8") as file:
            # __getitem__ needs two fields per line, so blank or truncated
            # lines are dropped here rather than raising IndexError later.
            self.lines = [line for line in file if len(line.split()) >= 2]

    def __len__(self):
        """Return the number of usable label lines."""
        return len(self.lines)

    def __getitem__(self, idx):
        """Return the image path and ground-truth text of sample ``idx``."""
        image_id, answers = self.lines[idx].split()[:2]
        return {
            "image_path": os.path.join(self.image_dir_path, image_id),
            "gt_answers": answers}
if __name__ == "__main__":
    # Manual smoke test: load the ReCTS data, print its size and first sample.
    #
    # Disabled IAM variant of the same check:
    #   data = IAMDataset('/home/zhangli/GPT4/MutimodelOCR/data/IAM')
    #   print(len(data))
    #   data = iter(data)
    #   batch = next(data)
    #   import pdb;pdb.set_trace()
    dataset = ReCTSDataset('/home/zhangli/GPT4/MutimodelOCR/data/ReCTS')
    print(len(dataset))
    first_sample = next(iter(dataset))
    print(first_sample)
+2 -2
View File
@@ -5,8 +5,8 @@ def has_chinese_characters(string):
pattern = re.compile(r'[\u4e00-\u9fa5]')
return bool(pattern.search(string))
if __name__ == "__main__":
ann_file = "/home/zhangli/GPT4/MutimodelOCR/data/ESTVQA/annotations/train.json"
img_file = "/home/zhangli/GPT4/MutimodelOCR/data/ESTVQA/images/train"
ann_file = "/home/zhangli/OCRData/data/TextVQA/ESTVQA/annotations/train.json"
#img_file = "/home/zhangli/GPT4/MutimodelOCR/data/ESTVQA/images/train"
cn_list = []
en_list= []
with open(ann_file,'r') as f:
View File
+23
View File
@@ -0,0 +1,23 @@
import re
import os
def has_chinese_characters(string):
    """Return True when ``string`` contains at least one character in U+4E00-U+9FA5."""
    return re.search(r'[\u4e00-\u9fa5]', string) is not None
def is_all_chinese(text):
    """Return True when ``text`` is non-empty and made up only of characters in U+4E00-U+9FA5."""
    # fullmatch is used instead of '^...$' with match(): '$' also matches just
    # before a trailing newline, so a string such as "中文\n" used to pass.
    return re.fullmatch(r'[\u4e00-\u9fa5]+', text) is not None
if __name__ =='__main__':
    # Copy to ann.txt every annotation line whose second field is all Chinese
    # and whose first field names an existing file in the crops directory.
    file_path = "/home/zhangli/GPT4/MutimodelOCR/data/ReCTS/annotation.txt"
    out_path = "/home/zhangli/GPT4/MutimodelOCR/data/ReCTS/ann.txt"
    crops_dir = "/home/zhangli/GPT4/MutimodelOCR/data/ReCTS/crops"
    # The input is opened first so a missing annotation file does not leave a
    # truncated output behind; both handles are closed even if an error occurs.
    with open(file_path, 'r', encoding='utf-8') as file, \
            open(out_path, 'w', encoding='utf-8') as out:
        for line in file:
            line = line.strip()
            fields = line.split()
            # Blank or single-field lines carry no (image, text) pair.
            if len(fields) < 2:
                continue
            path = os.path.join(crops_dir, fields[0])
            if is_all_chinese(fields[1]) and os.path.exists(path):
                out.write(line)
                out.write('\n')
Regular → Executable
View File