add
This commit is contained in:
23
datasets/process/process_ESTVQA.py
Normal file
23
datasets/process/process_ESTVQA.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
def has_chinese_characters(string):
    """Tell whether ``string`` contains a character in U+4E00..U+9FA5.

    Args:
        string: Text to scan.

    Returns:
        True if at least one character falls inside the inclusive range
        U+4E00..U+9FA5 (the range this script treats as "Chinese"),
        otherwise False. An empty string yields False.
    """
    return re.search(r'[\u4e00-\u9fa5]', string) is not None
|
||||
if __name__ == "__main__":
    # Split the ESTVQA training annotations into two files: records whose
    # question contains Chinese characters, and all remaining records.
    ann_file = "/home/zhangli/GPT4/MutimodelOCR/data/ESTVQA/annotations/train.json"

    # Decode explicitly as UTF-8: the questions may contain Chinese text, and
    # the platform default encoding is locale-dependent. This also matches the
    # encoding used for the two output files below.
    with open(ann_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    cn_list = []
    en_list = []
    for record in data:
        # Only the first annotation's question decides which bucket the whole
        # record goes to.
        # NOTE(review): presumably every question of a record shares one
        # language -- confirm against the dataset.
        first_question = record['annotation'][0]['question']
        if has_chinese_characters(first_question):
            cn_list.append(record)
        else:
            en_list.append(record)

    # ensure_ascii=False keeps non-ASCII characters readable in the output
    # instead of \uXXXX escapes.
    with open('./cn_train.json', 'w', encoding='utf-8') as f:
        json.dump(cn_list, f, ensure_ascii=False)
    with open('./en_train.json', 'w', encoding='utf-8') as f:
        json.dump(en_list, f, ensure_ascii=False)
|
||||
|
25
datasets/process/process_FUNSD.py
Normal file
25
datasets/process/process_FUNSD.py
Normal file
@@ -0,0 +1,25 @@
|
||||
import json
|
||||
import os
|
||||
def extract_qa_pairs(form):
    """Build question/answer text pairs from one annotation file's ``form`` list.

    Args:
        form: List of entity dicts. Each entity is read through its
            ``'label'``, ``'text'``, ``'id'`` and ``'linking'`` keys, where
            ``'linking'`` is a list of pairs whose second element is matched
            against answer ids.

    Returns:
        A ``(questions, answers)`` tuple of two equally long lists of strings.
        ``answers[i]`` is the concatenation of the texts of all answer
        entities linked from the question ``questions[i]``; it is ``""`` when
        none of the links points at an answer entity.
    """
    # Drop entities with a missing or empty linking list.
    linked_entities = [d for d in form if d.get("linking")]
    question_entries = [d for d in linked_entities if d.get('label') == 'question']
    answer_entries = [d for d in linked_entities if d.get('label') == 'answer']

    questions = []
    answers = []
    seen_texts = set()
    for question in question_entries:
        text = question['text']
        # Keep only the first question with a given text. The comparison is
        # against earlier *questions* only, so an answer or header that
        # happens to share the text does not suppress the question.
        if text in seen_texts:
            continue
        seen_texts.add(text)

        # NOTE(review): linked answer texts are joined without a separator,
        # as before -- confirm whether a space between fragments is wanted.
        gt_answer = "".join(
            answer['text']
            for pair in question['linking']
            for answer in answer_entries
            if answer['id'] == pair[1]
        )
        questions.append(text)
        answers.append(gt_answer)
    return questions, answers


if __name__ == "__main__":
    ann_dir_path = '/home/zhangli/GPT4/MutimodelOCR/data/FUNSD/training_data/annotations'
    questions = []
    answers = []
    # Sort the listing so the order of the collected pairs is reproducible;
    # os.listdir returns entries in arbitrary order.
    for file_name in sorted(os.listdir(ann_dir_path)):
        file_path = os.path.join(ann_dir_path, file_name)
        # Decode explicitly as UTF-8 instead of the locale-dependent default.
        with open(file_path, 'r', encoding='utf-8') as f:
            form = json.load(f)['form']
        file_questions, file_answers = extract_qa_pairs(form)
        questions.extend(file_questions)
        answers.extend(file_answers)
    # NOTE(review): `questions` and `answers` are collected but not written
    # anywhere in the script as shown.
|
||||
|
Reference in New Issue
Block a user