24 lines
872 B
Python
24 lines
872 B
Python
![]() |
import json
|
||
|
import os
|
||
|
import re
|
||
|
def has_chinese_characters(string):
|
||
|
pattern = re.compile(r'[\u4e00-\u9fa5]')
|
||
|
return bool(pattern.search(string))
|
||
|
if __name__ == "__main__":
|
||
|
ann_file = "/home/zhangli/GPT4/MutimodelOCR/data/ESTVQA/annotations/train.json"
|
||
|
img_file = "/home/zhangli/GPT4/MutimodelOCR/data/ESTVQA/images/train"
|
||
|
cn_list = []
|
||
|
en_list= []
|
||
|
with open(ann_file,'r') as f:
|
||
|
data = json.load(f)
|
||
|
for i in range(len(data)):
|
||
|
if has_chinese_characters(data[i]['annotation'][0]['question']):
|
||
|
cn_list.append(data[i])
|
||
|
else:
|
||
|
en_list.append(data[i])
|
||
|
with open('./cn_train.json', 'w', encoding='utf-8') as f:
|
||
|
json.dump(cn_list, f, ensure_ascii=False)
|
||
|
with open('./en_train.json', 'w', encoding='utf-8') as f:
|
||
|
json.dump(en_list, f, ensure_ascii=False)
|
||
|
|