25 lines
1.3 KiB
Python
Executable File
25 lines
1.3 KiB
Python
Executable File
import json
|
|
import os
|
|
def extract_qa_pairs(form):
    """Build parallel question/answer text lists from one annotation form.

    Args:
        form: List of entry dicts (the value stored under the ``'form'`` key
            of an annotation file). Entries that are used must provide
            ``'label'``, ``'text'``, ``'id'`` and ``'linking'``; ``'linking'``
            is a list of pairs whose second element is matched against
            answer ids.

    Returns:
        A ``(questions, answers)`` tuple of equally long lists. Each distinct
        question text appears once (first occurrence wins), paired with the
        concatenated text of every answer entry its links point to. A question
        whose links match no answer entry is paired with ``""``.
    """
    # Drop entries whose linking list is missing or empty.
    linked_entries = [entry for entry in form if entry.get("linking")]

    question_entries = [e for e in linked_entries if e.get("label") == "question"]
    answer_entries = [e for e in linked_entries if e.get("label") == "answer"]

    questions = []
    answers = []
    # De-duplicate against previously seen *question* texts. The earlier
    # version sliced the mixed entry list with a question-list index, which
    # both missed real duplicates and dropped questions that merely shared
    # their text with an unrelated earlier entry.
    seen_texts = set()
    for question in question_entries:
        text = question["text"]
        if text in seen_texts:
            continue
        seen_texts.add(text)

        # Walk the links in order and, for each, the answers in file order, so
        # the concatenation order matches the annotation order.
        # NOTE(review): answer fragments are joined without a separator, as
        # before; confirm whether a space between fragments is wanted.
        gt_answer = "".join(
            answer["text"]
            for link in question["linking"]
            for answer in answer_entries
            if answer["id"] == link[1]
        )
        questions.append(text)
        answers.append(gt_answer)
    return questions, answers


def load_qa_pairs(ann_dir_path):
    """Collect question/answer pairs from every JSON file in a directory.

    Args:
        ann_dir_path: Directory containing annotation ``.json`` files, each
            holding a top-level ``'form'`` list.

    Returns:
        A ``(questions, answers)`` tuple of parallel lists accumulated over
        all files, visited in sorted file-name order.
    """
    questions = []
    answers = []
    # os.listdir returns names in arbitrary order; sort for reproducible output.
    for file_name in sorted(os.listdir(ann_dir_path)):
        # Skip stray non-annotation files instead of crashing in json.load.
        if not file_name.endswith(".json"):
            continue
        file_path = os.path.join(ann_dir_path, file_name)
        # Explicit encoding: do not depend on the platform's locale default.
        with open(file_path, "r", encoding="utf-8") as f:
            form = json.load(f)["form"]
        file_questions, file_answers = extract_qa_pairs(form)
        questions.extend(file_questions)
        answers.extend(file_answers)
    return questions, answers


if __name__ == "__main__":
    ann_dir_path = '/home/zhangli/GPT4/MutimodelOCR/data/FUNSD/training_data/annotations'
    questions, answers = load_qa_pairs(ann_dir_path)
|
|
|