IAM ReCTS
This commit is contained in:
4
datasets/process/process_ESTVQA.py
Normal file → Executable file
4
datasets/process/process_ESTVQA.py
Normal file → Executable file
@@ -5,8 +5,8 @@ def has_chinese_characters(string):
|
||||
pattern = re.compile(r'[\u4e00-\u9fa5]')
|
||||
return bool(pattern.search(string))
|
||||
if __name__ == "__main__":
|
||||
ann_file = "/home/zhangli/GPT4/MutimodelOCR/data/ESTVQA/annotations/train.json"
|
||||
img_file = "/home/zhangli/GPT4/MutimodelOCR/data/ESTVQA/images/train"
|
||||
ann_file = "/home/zhangli/OCRData/data/TextVQA/ESTVQA/annotations/train.json"
|
||||
#img_file = "/home/zhangli/GPT4/MutimodelOCR/data/ESTVQA/images/train"
|
||||
cn_list = []
|
||||
en_list= []
|
||||
with open(ann_file,'r') as f:
|
||||
|
0
datasets/process/process_FUNSD.py
Normal file → Executable file
0
datasets/process/process_FUNSD.py
Normal file → Executable file
23
datasets/process/process_ReCTS.py
Normal file
23
datasets/process/process_ReCTS.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import re
|
||||
import os
|
||||
def has_chinese_characters(string):
    """Return True if ``string`` contains at least one character in U+4E00..U+9FA5.

    The search is unanchored, so a single matching character anywhere in
    the input is enough; an empty string yields False.
    """
    found = re.search(r'[\u4e00-\u9fa5]', string)
    return found is not None
|
||||
def is_all_chinese(text):
    """Return True if ``text`` consists solely of characters in U+4E00..U+9FA5.

    The whole string must match: an empty string, or a string containing
    any character outside that range (including whitespace or a trailing
    newline), yields False.
    """
    # fullmatch is used instead of ``^...$`` with match(): ``$`` also matches
    # just before a trailing newline, which would wrongly accept "中文\n".
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    return pattern.fullmatch(text) is not None
|
||||
if __name__ == '__main__':
    # Filter the ReCTS annotation list: keep only the lines whose second
    # whitespace-separated field passes is_all_chinese() and whose first
    # field names a file that exists under the crops directory.
    file_path = "/home/zhangli/GPT4/MutimodelOCR/data/ReCTS/annotation.txt"
    out_path = "/home/zhangli/GPT4/MutimodelOCR/data/ReCTS/ann.txt"
    crops_dir = "/home/zhangli/GPT4/MutimodelOCR/data/ReCTS/crops"

    # Both files are managed by ``with`` so the output is flushed and closed
    # even if processing raises. The encoding is explicit so that reading and
    # writing the annotation text does not depend on the locale default.
    with open(file_path, 'r', encoding='utf-8') as file, \
            open(out_path, 'w', encoding='utf-8') as out:
        for line in file:
            fields = line.split()
            # Blank or single-field lines carry no label to test; skip them
            # instead of failing with IndexError.
            if len(fields) < 2:
                continue
            name, text = fields[0], fields[1]
            path = os.path.join(crops_dir, name)
            if is_all_chinese(text) and os.path.exists(path):
                # Write the full stripped input line, not only the two
                # fields that were inspected.
                out.write(line.strip())
                out.write('\n')
|
Reference in New Issue
Block a user