23 lines
896 B
Python
23 lines
896 B
Python
![]() |
import re
|
||
|
import os
|
||
|
def has_chinese_characters(string: str) -> bool:
    """Return True if *string* contains at least one Chinese character.

    A character counts as Chinese when it lies in the range
    U+4E00..U+9FA5 (inclusive). The empty string yields False.

    Args:
        string: The text to scan.

    Returns:
        True when any character of *string* falls in the range above,
        False otherwise.
    """
    # The `re` module caches compiled patterns, so calling re.search with
    # the literal pattern on every invocation does not recompile it.
    return re.search(r'[\u4e00-\u9fa5]', string) is not None
|
||
|
def is_all_chinese(text: str) -> bool:
    """Return True if *text* consists solely of Chinese characters.

    Every character must lie in the range U+4E00..U+9FA5 (inclusive) and
    the string must be non-empty.

    Args:
        text: The string to check.

    Returns:
        True when *text* is non-empty and every character is in the range
        above, False otherwise.
    """
    # fullmatch is used instead of a '^...$' pattern because '$' also
    # matches just before a trailing newline, which would wrongly accept
    # strings such as "中文\n".
    return re.fullmatch(r'[\u4e00-\u9fa5]+', text) is not None
|
||
|
if __name__ == '__main__':
    # Filter the annotation file: keep only the records whose second
    # whitespace-separated field is made up entirely of Chinese characters
    # and whose first field names an existing file under the crops
    # directory. Kept records are written one per line to the output file.
    file_path = "/home/zhangli/GPT4/MutimodelOCR/data/ReCTS/annotation.txt"
    out_path = "/home/zhangli/GPT4/MutimodelOCR/data/ReCTS/ann.txt"
    crops_dir = "/home/zhangli/GPT4/MutimodelOCR/data/ReCTS/crops"

    # Both files are managed by `with` so they are closed even if an error
    # occurs mid-way. The input is opened first so that a missing input
    # file does not truncate an existing output file.
    # The encoding is explicit because the records contain Chinese text and
    # the platform default encoding is locale-dependent.
    # NOTE(review): assumes the annotation file is UTF-8 -- confirm.
    with open(file_path, 'r', encoding='utf-8') as file, \
            open(out_path, 'w', encoding='utf-8') as out:
        # Iterate lazily instead of loading every line into memory.
        for line in file:
            record = line.strip()
            fields = record.split()
            # Skip blank or malformed lines that lack the two leading
            # fields; indexing them would raise IndexError.
            if len(fields) < 2:
                continue
            name, text = fields[0], fields[1]
            path = os.path.join(crops_dir, name)
            if is_all_chinese(text) and os.path.exists(path):
                out.write(record + '\n')
|