This commit is contained in:
echo840
2023-05-23 18:24:16 +08:00
parent da758a9ca7
commit b388fba03e
470 changed files with 2523750 additions and 7307 deletions

View File

@@ -0,0 +1,111 @@
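# GPT-4-based pairwise review of two answer files over a shared question file.
# For each question, a review prompt is built from the per-category rule (falling
# back to 'default'), dispatched to GPT-4 as a Ray remote task, and the resulting
# review text plus the parsed score pair are written as one JSON object per line.
# Example invocation (script and file names are illustrative):
#   python eval_gpt_review.py -q question.jsonl -a answer1.jsonl answer2.jsonl -r rule.json -o review.jsonl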
import argparse
import json
import os
import openai
import tqdm
import ray
import time
@ray.remote(num_cpus=4)
def get_eval(content: str, max_tokens: int):
while True:
try:
response = openai.ChatCompletion.create(
model='gpt-4',
messages=[{
'role': 'system',
'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
}, {
'role': 'user',
'content': content,
}],
temperature=0.2, # TODO: figure out which temperature is best for evaluation
max_tokens=max_tokens,
)
break
except openai.error.RateLimitError:
pass
except Exception as e:
print(e)
time.sleep(1)
print('success!')
return response['choices'][0]['message']['content']
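# The review is expected to begin with a line containing two numeric scores,
# e.g. "8 7" or "8, 7"; anything that does not parse yields the sentinel [-1, -1].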
def parse_score(review):
try:
score_pair = review.split('\n')[0]
score_pair = score_pair.replace(',', ' ')
sp = score_pair.split()
if len(sp) == 2:
return [float(sp[0]), float(sp[1])]
else:
print('error', review)
return [-1, -1]
except Exception as e:
print(e)
print('error', review)
return [-1, -1]
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
parser.add_argument('-q', '--question')
# parser.add_argument('-a', '--answer')
parser.add_argument('-a', '--answer-list', nargs='+', default=[])
parser.add_argument('-r', '--rule')
parser.add_argument('-o', '--output')
parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
args = parser.parse_args()
ray.init()
f_q = open(os.path.expanduser(args.question))
f_ans1 = open(os.path.expanduser(args.answer_list[0]))
f_ans2 = open(os.path.expanduser(args.answer_list[1]))
rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
review_file = open(f'{args.output}', 'w')
js_list = []
handles = []
idx = 0
for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
# if idx == 1:
# break
ques = json.loads(ques_js)
ans1 = json.loads(ans1_js)
ans2 = json.loads(ans2_js)
category = ques['category']
if category in rule_dict:
rule = rule_dict[category]
else:
rule = rule_dict['default']
prompt = rule['prompt']
role = rule['role']
content = (f'[Question]\n{ques["text"]}\n\n'
f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
f'[System]\n{prompt}\n\n')
js_list.append({
'id': idx+1,
'question_id': ques['question_id'],
'answer1_id': ans1['answer_id'],
'answer2_id': ans2['answer_id'],
'category': category})
idx += 1
handles.append(get_eval.remote(content, args.max_tokens))
# To avoid the rate limit set by OpenAI
time.sleep(1)
reviews = ray.get(handles)
for idx, review in enumerate(reviews):
scores = parse_score(review)
js_list[idx]['content'] = review
js_list[idx]['tuple'] = scores
review_file.write(json.dumps(js_list[idx]) + '\n')
review_file.close()

View File

@@ -0,0 +1,116 @@
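# Visual variant of the GPT-4 pairwise review: in addition to the question and two
# answers, the prompt includes per-image context (captions and "category: bbox"
# instance annotations) looked up from the --context file by image name. Unknown
# categories raise an error instead of falling back to a default rule.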
import argparse
import json
import os
import openai
import tqdm
import ray
import time
@ray.remote(num_cpus=4)
def get_eval(content: str, max_tokens: int):
while True:
try:
response = openai.ChatCompletion.create(
model='gpt-4',
messages=[{
'role': 'system',
'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
}, {
'role': 'user',
'content': content,
}],
temperature=0.2, # TODO: figure out which temperature is best for evaluation
max_tokens=max_tokens,
)
break
except openai.error.RateLimitError:
pass
except Exception as e:
print(e)
time.sleep(1)
print('success!')
return response['choices'][0]['message']['content']
def parse_score(review):
try:
score_pair = review.split('\n')[0]
score_pair = score_pair.replace(',', ' ')
sp = score_pair.split()
if len(sp) == 2:
return [float(sp[0]), float(sp[1])]
else:
print('error', review)
return [-1, -1]
except Exception as e:
print(e)
print('error', review)
return [-1, -1]
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
parser.add_argument('-q', '--question')
parser.add_argument('-c', '--context')
parser.add_argument('-a', '--answer-list', nargs='+', default=[])
parser.add_argument('-r', '--rule')
parser.add_argument('-o', '--output')
parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
args = parser.parse_args()
ray.init()
f_q = open(os.path.expanduser(args.question))
f_ans1 = open(os.path.expanduser(args.answer_list[0]))
f_ans2 = open(os.path.expanduser(args.answer_list[1]))
rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
review_file = open(f'{args.output}', 'w')
context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
image_to_context = {context['image']: context for context in context_list}
js_list = []
handles = []
idx = 0
for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
ques = json.loads(ques_js)
ans1 = json.loads(ans1_js)
ans2 = json.loads(ans2_js)
inst = image_to_context[ques['image']]
cap_str = '\n'.join(inst['captions'])
box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']])
category = ques['category']
if category in rule_dict:
rule = rule_dict[category]
else:
assert False, f"Visual QA category not found in rule file: {category}."
prompt = rule['prompt']
role = rule['role']
content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n'
f'[Question]\n{ques["text"]}\n\n'
f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
f'[System]\n{prompt}\n\n')
js_list.append({
'id': idx+1,
'question_id': ques['question_id'],
'answer1_id': ans1.get('answer_id', ans1['question_id']),
'answer2_id': ans2.get('answer_id', ans2['question_id']),
'category': category})
idx += 1
handles.append(get_eval.remote(content, args.max_tokens))
# To avoid the rate limit set by OpenAI
time.sleep(1)
reviews = ray.get(handles)
for idx, review in enumerate(reviews):
scores = parse_score(review)
js_list[idx]['content'] = review
js_list[idx]['tuple'] = scores
review_file.write(json.dumps(js_list[idx]) + '\n')
review_file.close()

View File

@@ -0,0 +1,99 @@
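# Scores ScienceQA predictions against the ground-truth answers. The predicted
# letter is extracted with the pattern "The answer is X."; predictions that cannot
# be parsed fall back to a random choice. Writes a per-question analysis file and
# an aggregate results file with the overall accuracy.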
import argparse
import json
import os
import re
import random
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--base-dir', type=str)
parser.add_argument('--result-file', type=str)
parser.add_argument('--output-file', type=str)
parser.add_argument('--output-result', type=str)
parser.add_argument('--split', type=str, default='test')
parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
return parser.parse_args()
def convert_caps(results):
fakecaps = []
for result in results:
image_id = result['question_id']
caption = result['text']
fakecaps.append({"image_id": int(image_id), "caption": caption})
return fakecaps
def get_pred_idx(prediction, choices, options):
"""
Get the index (e.g. 2) from the prediction (e.g. 'C')
"""
if prediction in options[:len(choices)]:
return options.index(prediction)
else:
return random.choice(range(len(choices)))
if __name__ == "__main__":
args = get_args()
base_dir = args.base_dir
split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
problems = json.load(open(os.path.join(base_dir, "problems.json")))
predictions = [json.loads(line) for line in open(args.result_file)]
predictions = {pred['question_id']: pred for pred in predictions}
split_problems = {idx: problems[idx] for idx in split_indices}
results = {'correct': [], 'incorrect': []}
sqa_results = {}
sqa_results['acc'] = None
sqa_results['correct'] = None
sqa_results['count'] = None
sqa_results['results'] = {}
sqa_results['outputs'] = {}
for prob_id, prob in split_problems.items():
if prob_id not in predictions:
continue
pred = predictions[prob_id]
pred_text = pred['text']
pattern = re.compile(r'The answer is ([A-Z]).')
res = pattern.findall(pred_text)
if len(res) == 1:
answer = res[0] # 'A', 'B', ...
else:
answer = "FAILED"
pred_idx = get_pred_idx(answer, prob['choices'], args.options)
analysis = {
'question_id': prob_id,
'parsed_ans': answer,
'ground_truth': args.options[prob['answer']],
'question': pred['prompt'],
'pred': pred_text,
'is_multimodal': '<image>' in pred['prompt'],
}
sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options)
sqa_results['outputs'][prob_id] = pred_text
if pred_idx == prob['answer']:
results['correct'].append(analysis)
else:
results['incorrect'].append(analysis)
correct = len(results['correct'])
total = len(results['correct']) + len(results['incorrect'])
print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%')
sqa_results['acc'] = correct / total * 100
sqa_results['correct'] = correct
sqa_results['count'] = total
with open(args.output_file, 'w') as f:
json.dump(results, f, indent=2)
with open(args.output_result, 'w') as f:
json.dump(sqa_results, f, indent=2)

View File

@@ -0,0 +1,104 @@
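# Compares GPT-4 predictions with this model's predictions on ScienceQA. When GPT-4
# produces no parseable answer, our prediction is substituted. Reports the resulting
# accuracy, an upper bound where either prediction is correct, and the fraction of
# questions on which GPT-4 gave no parseable answer.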
import argparse
import json
import os
import re
import random
from collections import defaultdict
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--base-dir', type=str)
parser.add_argument('--gpt4-result', type=str)
parser.add_argument('--our-result', type=str)
parser.add_argument('--split', type=str, default='test')
parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
return parser.parse_args()
def convert_caps(results):
fakecaps = []
for result in results:
image_id = result['question_id']
caption = result['text']
fakecaps.append({"image_id": int(image_id), "caption": caption})
return fakecaps
def get_pred_idx(prediction, choices, options):
"""
Get the index (e.g. 2) from the prediction (e.g. 'C')
"""
if prediction in options[:len(choices)]:
return options.index(prediction)
else:
return random.choice(range(len(choices)))
if __name__ == "__main__":
args = get_args()
base_dir = args.base_dir
split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
problems = json.load(open(os.path.join(base_dir, "problems.json")))
our_predictions = [json.loads(line) for line in open(args.our_result)]
our_predictions = {pred['question_id']: pred for pred in our_predictions}
split_problems = {idx: problems[idx] for idx in split_indices}
gpt4_predictions = json.load(open(args.gpt4_result))['outputs']
results = defaultdict(lambda: 0)
for prob_id, prob in split_problems.items():
if prob_id not in our_predictions:
continue
if prob_id not in gpt4_predictions:
continue
our_pred = our_predictions[prob_id]['text']
gpt4_pred = gpt4_predictions[prob_id]
pattern = re.compile(r'The answer is ([A-Z]).')
our_res = pattern.findall(our_pred)
if len(our_res) == 1:
our_answer = our_res[0] # 'A', 'B', ...
else:
our_answer = "FAILED"
gpt4_res = pattern.findall(gpt4_pred)
if len(gpt4_res) == 1:
gpt4_answer = gpt4_res[0] # 'A', 'B', ...
else:
gpt4_answer = "FAILED"
our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options)
gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options)
if gpt4_answer == 'FAILED':
results['gpt4_failed'] += 1
# continue
gpt4_pred_idx = our_pred_idx
# if our_pred_idx != prob['answer']:
# print(our_predictions[prob_id]['prompt'])
# print('-----------------')
# print(f'LECTURE: {prob["lecture"]}')
# print(f'SOLUTION: {prob["solution"]}')
# print('=====================')
else:
# continue
pass
# gpt4_pred_idx = our_pred_idx
if gpt4_pred_idx == prob['answer']:
results['correct'] += 1
else:
results['incorrect'] += 1
if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']:
results['correct_upperbound'] += 1
correct = results['correct']
total = results['correct'] + results['incorrect']
print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%')
print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%')
print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%')

View File

@@ -0,0 +1,149 @@
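# Evaluates the GPT-4 requery pipeline on ScienceQA: parses our answer, the GPT-4
# answer, and the requery answer for every problem, tallies per-source accuracies
# plus an upper bound, and saves the requery results in the ScienceQA result format.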
import argparse
import json
import os
import re
import random
from collections import defaultdict
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--base-dir', type=str)
parser.add_argument('--gpt4-result', type=str)
parser.add_argument('--requery-result', type=str)
parser.add_argument('--our-result', type=str)
parser.add_argument('--output-result', type=str)
parser.add_argument('--split', type=str, default='test')
parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
return parser.parse_args()
def convert_caps(results):
fakecaps = []
for result in results:
image_id = result['question_id']
caption = result['text']
fakecaps.append({"image_id": int(image_id), "caption": caption})
return fakecaps
def get_pred_idx(prediction, choices, options):
"""
Get the index (e.g. 2) from the prediction (e.g. 'C')
"""
if prediction in options[:len(choices)]:
return options.index(prediction)
else:
return random.choice(range(len(choices)))
if __name__ == "__main__":
args = get_args()
base_dir = args.base_dir
split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
problems = json.load(open(os.path.join(base_dir, "problems.json")))
our_predictions = [json.loads(line) for line in open(args.our_result)]
our_predictions = {pred['question_id']: pred for pred in our_predictions}
split_problems = {idx: problems[idx] for idx in split_indices}
requery_predictions = [json.loads(line) for line in open(args.requery_result)]
requery_predictions = {pred['question_id']: pred for pred in requery_predictions}
gpt4_predictions = json.load(open(args.gpt4_result))['outputs']
results = defaultdict(lambda: 0)
sqa_results = {}
sqa_results['acc'] = None
sqa_results['correct'] = None
sqa_results['count'] = None
sqa_results['results'] = {}
sqa_results['outputs'] = {}
for prob_id, prob in split_problems.items():
if prob_id not in our_predictions:
assert False
if prob_id not in gpt4_predictions:
assert False
our_pred = our_predictions[prob_id]['text']
gpt4_pred = gpt4_predictions[prob_id]
if prob_id not in requery_predictions:
results['missing_requery'] += 1
requery_pred = "MISSING"
else:
requery_pred = requery_predictions[prob_id]['text']
pattern = re.compile(r'The answer is ([A-Z]).')
our_res = pattern.findall(our_pred)
if len(our_res) == 1:
our_answer = our_res[0] # 'A', 'B', ...
else:
our_answer = "FAILED"
requery_res = pattern.findall(requery_pred)
if len(requery_res) == 1:
requery_answer = requery_res[0] # 'A', 'B', ...
else:
requery_answer = "FAILED"
gpt4_res = pattern.findall(gpt4_pred)
if len(gpt4_res) == 1:
gpt4_answer = gpt4_res[0] # 'A', 'B', ...
else:
gpt4_answer = "FAILED"
our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options)
gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options)
requery_pred_idx = get_pred_idx(requery_answer, prob['choices'], args.options)
results['total'] += 1
if gpt4_answer == 'FAILED':
results['gpt4_failed'] += 1
if gpt4_pred_idx == prob['answer']:
results['gpt4_correct'] += 1
if our_pred_idx == prob['answer']:
results['gpt4_ourvisual_correct'] += 1
elif gpt4_pred_idx == prob['answer']:
results['gpt4_correct'] += 1
results['gpt4_ourvisual_correct'] += 1
if our_pred_idx == prob['answer']:
results['our_correct'] += 1
if requery_answer == 'FAILED':
sqa_results['results'][prob_id] = our_pred_idx
if our_pred_idx == prob['answer']:
results['requery_correct'] += 1
else:
sqa_results['results'][prob_id] = requery_pred_idx
if requery_pred_idx == prob['answer']:
results['requery_correct'] += 1
else:
print(f"""
Question ({args.options[prob['answer']]}): {our_predictions[prob_id]['prompt']}
Our ({our_answer}): {our_pred}
GPT-4 ({gpt4_answer}): {gpt4_pred}
Requery ({requery_answer}): {requery_pred}
""")
print("=====================================")
if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']:
results['correct_upperbound'] += 1
total = results['total']
print(f'Total: {total}, Our-Correct: {results["our_correct"]}, Accuracy: {results["our_correct"] / total * 100:.2f}%')
print(f'Total: {total}, GPT-4-Correct: {results["gpt4_correct"]}, Accuracy: {results["gpt4_correct"] / total * 100:.2f}%')
print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%')
print(f'Total: {total}, GPT-4-OursVisual-Correct: {results["gpt4_ourvisual_correct"]}, Accuracy: {results["gpt4_ourvisual_correct"] / total * 100:.2f}%')
print(f'Total: {total}, Requery-Correct: {results["requery_correct"]}, Accuracy: {results["requery_correct"] / total * 100:.2f}%')
print(f'Total: {total}, Correct upper: {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%')
sqa_results['acc'] = results["requery_correct"] / total * 100
sqa_results['correct'] = results["requery_correct"]
sqa_results['count'] = total
with open(args.output_result, 'w') as f:
json.dump(sqa_results, f, indent=2)

View File

@@ -0,0 +1,111 @@
"""Generate json file for webpage."""
import json
import os
import re
# models = ['llama', 'alpaca', 'gpt35', 'bard']
models = ['vicuna']
def read_jsonl(path: str, key: str=None):
data = []
with open(os.path.expanduser(path)) as f:
for line in f:
if not line:
continue
data.append(json.loads(line))
if key is not None:
data.sort(key=lambda x: x[key])
data = {item[key]: item for item in data}
return data
def trim_hanging_lines(s: str, n: int) -> str:
s = s.strip()
for _ in range(n):
s = s.split('\n', 1)[1].strip()
return s
if __name__ == '__main__':
questions = read_jsonl('table/question.jsonl', key='question_id')
# alpaca_answers = read_jsonl('table/answer/answer_alpaca-13b.jsonl', key='question_id')
# bard_answers = read_jsonl('table/answer/answer_bard.jsonl', key='question_id')
# gpt35_answers = read_jsonl('table/answer/answer_gpt35.jsonl', key='question_id')
# llama_answers = read_jsonl('table/answer/answer_llama-13b.jsonl', key='question_id')
vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id')
ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id')
review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id')
# review_alpaca = read_jsonl('table/review/review_alpaca-13b_vicuna-13b.jsonl', key='question_id')
# review_bard = read_jsonl('table/review/review_bard_vicuna-13b.jsonl', key='question_id')
# review_gpt35 = read_jsonl('table/review/review_gpt35_vicuna-13b.jsonl', key='question_id')
# review_llama = read_jsonl('table/review/review_llama-13b_vicuna-13b.jsonl', key='question_id')
records = []
for qid in questions.keys():
r = {
'id': qid,
'category': questions[qid]['category'],
'question': questions[qid]['text'],
'answers': {
# 'alpaca': alpaca_answers[qid]['text'],
# 'llama': llama_answers[qid]['text'],
# 'bard': bard_answers[qid]['text'],
# 'gpt35': gpt35_answers[qid]['text'],
'vicuna': vicuna_answers[qid]['text'],
'ours': ours_answers[qid]['text'],
},
'evaluations': {
# 'alpaca': review_alpaca[qid]['text'],
# 'llama': review_llama[qid]['text'],
# 'bard': review_bard[qid]['text'],
'vicuna': review_vicuna[qid]['content'],
# 'gpt35': review_gpt35[qid]['text'],
},
'scores': {
'vicuna': review_vicuna[qid]['tuple'],
# 'alpaca': review_alpaca[qid]['score'],
# 'llama': review_llama[qid]['score'],
# 'bard': review_bard[qid]['score'],
# 'gpt35': review_gpt35[qid]['score'],
},
}
# cleanup data
cleaned_evals = {}
for k, v in r['evaluations'].items():
v = v.strip()
lines = v.split('\n')
# trim the first line if it's a pair of numbers
if re.match(r'\d+[, ]+\d+', lines[0]):
lines = lines[1:]
v = '\n'.join(lines)
cleaned_evals[k] = v.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**')
r['evaluations'] = cleaned_evals
records.append(r)
# Reorder the records; this step is optional
for r in records:
if r['id'] <= 20:
r['id'] += 60
else:
r['id'] -= 20
for r in records:
if r['id'] <= 50:
r['id'] += 10
elif 50 < r['id'] <= 60:
r['id'] -= 50
for r in records:
if r['id'] == 7:
r['id'] = 1
elif r['id'] < 7:
r['id'] += 1
records.sort(key=lambda x: x['id'])
# Write to file
with open('webpage/data.json', 'w') as f:
json.dump({'questions': records, 'models': models}, f, indent=2)

View File

@@ -0,0 +1,84 @@
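# Generates answers for a JSONL question file with a plain (text-only) causal LM:
# each question is wrapped in the default conversation template, generation stops at
# the conversation separator, and the decoded answer is written as one JSON object
# per line to the answers file.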
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria
import torch
import os
import json
from tqdm import tqdm
import shortuuid
from llava.conversation import default_conversation
from llava.utils import disable_torch_init
# new stopping implementation
class KeywordsStoppingCriteria(StoppingCriteria):
def __init__(self, keywords, tokenizer, input_ids):
self.keywords = keywords
self.tokenizer = tokenizer
self.start_len = None
self.input_ids = input_ids
def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
if self.start_len is None:
self.start_len = self.input_ids.shape[1]
else:
outputs = self.tokenizer.batch_decode(output_ids[:, self.start_len:], skip_special_tokens=True)[0]
for keyword in self.keywords:
if keyword in outputs:
return True
return False
@torch.inference_mode()
def eval_model(model_name, questions_file, answers_file):
# Model
disable_torch_init()
model_name = os.path.expanduser(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,
torch_dtype=torch.float16).cuda()
ques_file = open(os.path.expanduser(questions_file), "r")
ans_file = open(os.path.expanduser(answers_file), "w")
for i, line in enumerate(tqdm(ques_file)):
idx = json.loads(line)["question_id"]
qs = json.loads(line)["text"]
cat = json.loads(line)["category"]
conv = default_conversation.copy()
conv.append_message(conv.roles[0], qs)
prompt = conv.get_prompt()
inputs = tokenizer([prompt])
input_ids = torch.as_tensor(inputs.input_ids).cuda()
stopping_criteria = KeywordsStoppingCriteria([conv.sep], tokenizer, input_ids)
output_ids = model.generate(
input_ids,
do_sample=True,
temperature=0.7,
max_new_tokens=1024,
stopping_criteria=[stopping_criteria])
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
try:
index = outputs.index(conv.sep, len(prompt))
except ValueError:
outputs += conv.sep
index = outputs.index(conv.sep, len(prompt))
outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip()
ans_id = shortuuid.uuid()
ans_file.write(json.dumps({"question_id": idx,
"text": outputs,
"answer_id": ans_id,
"model_id": model_name,
"metadata": {}}) + "\n")
ans_file.flush()
ans_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-name", type=str, default="facebook/opt-350m")
parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
parser.add_argument("--answers-file", type=str, default="answer.jsonl")
args = parser.parse_args()
eval_model(args.model_name, args.question_file, args.answers_file)

View File

@@ -0,0 +1,207 @@
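# Visual question answering with LLaVA: loads the checkpoint (optionally combining a
# base model with separately provided --mm-projector weights), appends image patch
# tokens to each question, and generates one answer per question. The question file
# can be sharded across workers via --num-chunks / --chunk-idx.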
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
import torch
import os
import json
from tqdm import tqdm
import shortuuid
from llava import LlavaLlamaForCausalLM
from llava.conversation import conv_templates
from llava.utils import disable_torch_init
from transformers import CLIPVisionModel, CLIPImageProcessor, StoppingCriteria
from PIL import Image
import random
import math
def split_list(lst, n):
"""Split a list into n (roughly) equal-sized chunks"""
chunk_size = math.ceil(len(lst) / n)  # ceiling division
return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
def get_chunk(lst, n, k):
chunks = split_list(lst, n)
return chunks[k]
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
def patch_config(config):
patch_dict = {
"use_mm_proj": True,
"mm_vision_tower": "openai/clip-vit-large-patch14",
"mm_hidden_size": 1024
}
cfg = AutoConfig.from_pretrained(config)
if not hasattr(cfg, "mm_vision_tower"):
print(f'`mm_vision_tower` not found in `{config}`, applying patch and saving to disk.')
for k, v in patch_dict.items():
setattr(cfg, k, v)
cfg.save_pretrained(config)
def eval_model(args):
# Model
disable_torch_init()
model_name = os.path.expanduser(args.model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
if args.mm_projector is None:
patch_config(model_name)
model = LlavaLlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).cuda()
image_processor = CLIPImageProcessor.from_pretrained(model.config.mm_vision_tower, torch_dtype=torch.float16)
mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
if mm_use_im_start_end:
tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
vision_tower = model.model.vision_tower[0]
vision_tower.to(device='cuda', dtype=torch.float16)
vision_config = vision_tower.config
vision_config.im_patch_token = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0]
vision_config.use_im_start_end = mm_use_im_start_end
if mm_use_im_start_end:
vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
image_token_len = (vision_config.image_size // vision_config.patch_size) ** 2
else:
# in case of using a pretrained checkpoint that provides only the MLP projector weights
model = LlavaLlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).cuda()
vision_tower = CLIPVisionModel.from_pretrained(args.vision_tower, torch_dtype=torch.float16).cuda()
image_processor = CLIPImageProcessor.from_pretrained(args.vision_tower, torch_dtype=torch.float16)
mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
if mm_use_im_start_end:
tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
vision_config = vision_tower.config
vision_config.im_patch_token = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0]
vision_config.use_im_start_end = mm_use_im_start_end
if mm_use_im_start_end:
vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
image_token_len = (vision_config.image_size // vision_config.patch_size) ** 2
mm_projector = torch.nn.Linear(vision_config.hidden_size, model.config.hidden_size)
mm_projector_weights = torch.load(args.mm_projector, map_location='cpu')
mm_projector.load_state_dict({k.split('.')[-1]: v for k, v in mm_projector_weights.items()})
model.model.mm_projector = mm_projector.cuda().half()
model.model.vision_tower = [vision_tower]
questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
answers_file = os.path.expanduser(args.answers_file)
os.makedirs(os.path.dirname(answers_file), exist_ok=True)
ans_file = open(answers_file, "w")
for i, line in enumerate(tqdm(questions)):
idx = line["question_id"]
image_file = line["image"]
qs = line["text"]
cur_prompt = qs
if mm_use_im_start_end:
qs = qs + '\n' + DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len + DEFAULT_IM_END_TOKEN
else:
qs = qs + '\n' + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len
if args.conv_mode == 'simple_legacy':
qs += '\n\n### Response:'
# conv = default_conversation.copy()
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
prompt = conv.get_prompt()
inputs = tokenizer([prompt])
image = Image.open(os.path.join(args.image_folder, image_file))
# image.save(os.path.join(save_image_folder, image_file))
image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
input_ids = torch.as_tensor(inputs.input_ids).cuda()
# new stopping implementation
class KeywordsStoppingCriteria(StoppingCriteria):
def __init__(self, keywords, tokenizer, input_ids):
self.keywords = keywords
self.tokenizer = tokenizer
self.start_len = None
self.input_ids = input_ids
def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
if self.start_len is None:
self.start_len = self.input_ids.shape[1]
else:
outputs = self.tokenizer.batch_decode(output_ids[:, self.start_len:], skip_special_tokens=True)[0]
for keyword in self.keywords:
if keyword in outputs:
return True
return False
keywords = ['###']
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=image_tensor.unsqueeze(0).half().cuda(),
do_sample=True,
temperature=0.7,
max_new_tokens=1024,
stopping_criteria=[stopping_criteria])
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f'[Warning] Sample {i}: {n_diff_input_output} output_ids are not the same as the input_ids')
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
if args.conv_mode == 'simple_legacy' or args.conv_mode == 'simple':
while True:
cur_len = len(outputs)
outputs = outputs.strip()
for pattern in ['###', 'Assistant:', 'Response:']:
if outputs.startswith(pattern):
outputs = outputs[len(pattern):].strip()
if len(outputs) == cur_len:
break
try:
index = outputs.index(conv.sep)
except ValueError:
outputs += conv.sep
index = outputs.index(conv.sep)
outputs = outputs[:index].strip()
ans_id = shortuuid.uuid()
ans_file.write(json.dumps({"question_id": idx,
"prompt": cur_prompt,
"text": outputs,
"answer_id": ans_id,
"model_id": model_name,
"metadata": {}}) + "\n")
ans_file.flush()
ans_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-name", type=str, default="facebook/opt-350m")
parser.add_argument("--image-folder", type=str, default="")
parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
parser.add_argument("--answers-file", type=str, default="answer.jsonl")
parser.add_argument("--mm-projector", type=str, default=None)
parser.add_argument("--vision-tower", type=str, default=None)
parser.add_argument("--conv-mode", type=str, default="simple")
parser.add_argument("--num-chunks", type=int, default=1)
parser.add_argument("--chunk-idx", type=int, default=0)
args = parser.parse_args()
eval_model(args)

View File

@@ -0,0 +1,309 @@
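# ScienceQA inference with LLaVA: reads the ScienceQA-format question JSON, attaches
# the image (when present) as patch tokens, generates a free-form response, and can
# optionally re-query the model with an "ANSWER:" suffix (--answer-prompter) to
# extract the final choice after the reasoning.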
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
import torch
import os
import json
from tqdm import tqdm
import shortuuid
from llava import LlavaLlamaForCausalLM
from llava.conversation import conv_templates
from llava.utils import disable_torch_init
from transformers import CLIPVisionModel, CLIPImageProcessor, StoppingCriteria
from PIL import Image
import random
import math
def split_list(lst, n):
"""Split a list into n (roughly) equal-sized chunks"""
chunk_size = math.ceil(len(lst) / n)  # ceiling division
return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
def get_chunk(lst, n, k):
chunks = split_list(lst, n)
return chunks[k]
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
detail_describe_instructions = [
"Describe the following image in detail.",
"Provide a detailed description of the given image.",
"Give an elaborate explanation of the image you see.",
"Share a comprehensive rundown of the presented image.",
"Offer a thorough analysis of the image.",
"Explain the various aspects of the image before you.",
"Clarify the contents of the displayed image with great detail.",
"Characterize the image using a well-detailed description.",
"Break down the elements of the image in a detailed manner.",
"Walk through the important details of the image.",
"Portray the image with a rich, descriptive narrative.",
"Narrate the contents of the image with precision.",
"Analyze the image in a comprehensive and detailed manner.",
"Illustrate the image through a descriptive explanation.",
"Examine the image closely and share its details.",
"Write an exhaustive depiction of the given image.",
]
concise_describe_instructions = [
"Describe the following image concisely.",
"Provide a brief description of the given image.",
"Offer a succinct explanation of the picture presented.",
"Summarize the visual content of the following image.",
"Give a short and clear explanation of the subsequent image.",
"Share a concise interpretation of the image provided.",
"Present a compact description of the photo's key features.",
"Relay a brief, clear account of the picture shown.",
"Render a clear and concise summary of the photo below.",
"Write a terse but informative summary of the following picture.",
"Create a compact narrative representing the image presented.",
]
prompt_pool = detail_describe_instructions + concise_describe_instructions
prompt_pool = [ "Describe the following image in detail."]
def patch_config(config):
patch_dict = {
"use_mm_proj": True,
"mm_vision_tower": "openai/clip-vit-large-patch14",
"mm_hidden_size": 1024
}
cfg = AutoConfig.from_pretrained(config)
if not hasattr(cfg, "mm_vision_tower"):
print(f'`mm_vision_tower` not found in `{config}`, applying patch and saving to disk.')
for k, v in patch_dict.items():
setattr(cfg, k, v)
cfg.save_pretrained(config)
# new stopping implementation
class KeywordsStoppingCriteria(StoppingCriteria):
def __init__(self, keywords, tokenizer, input_ids):
self.keywords = keywords
self.tokenizer = tokenizer
self.start_len = None
self.input_ids = input_ids
def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
if self.start_len is None:
self.start_len = self.input_ids.shape[1]
else:
outputs = self.tokenizer.batch_decode(output_ids[:, self.start_len:], skip_special_tokens=True)[0]
for keyword in self.keywords:
if keyword in outputs:
return True
return False
def eval_model(args):
# Model
disable_torch_init()
model_name = os.path.expanduser(args.model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
if args.mm_projector is None:
patch_config(model_name)
model = LlavaLlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, use_cache=True).cuda()
image_processor = CLIPImageProcessor.from_pretrained(model.config.mm_vision_tower, torch_dtype=torch.float16)
mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
if mm_use_im_start_end:
tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
vision_tower = model.model.vision_tower[0]
vision_tower.to(device='cuda', dtype=torch.float16)
vision_config = vision_tower.config
vision_config.im_patch_token = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0]
vision_config.use_im_start_end = mm_use_im_start_end
if mm_use_im_start_end:
vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
image_token_len = (vision_config.image_size // vision_config.patch_size) ** 2
else:
# in case of using a pretrained checkpoint that provides only the MLP projector weights
model = LlavaLlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, use_cache=True).cuda()
mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
if mm_use_im_start_end:
tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
vision_tower = CLIPVisionModel.from_pretrained(args.vision_tower, torch_dtype=torch.float16).cuda()
image_processor = CLIPImageProcessor.from_pretrained(args.vision_tower, torch_dtype=torch.float16)
vision_config = vision_tower.config
vision_config.im_patch_token = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0]
vision_config.use_im_start_end = mm_use_im_start_end
if mm_use_im_start_end:
vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
image_token_len = (vision_config.image_size // vision_config.patch_size) ** 2
mm_projector = torch.nn.Linear(vision_config.hidden_size, model.config.hidden_size)
mm_projector_weights = torch.load(args.mm_projector, map_location='cpu')
mm_projector.load_state_dict({k.split('.')[-1]: v for k, v in mm_projector_weights.items()})
model.model.mm_projector = mm_projector.cuda().half()
model.model.vision_tower = [vision_tower]
questions = json.load(open(os.path.expanduser(args.question_file), "r"))
questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
answers_file = os.path.expanduser(args.answers_file)
os.makedirs(os.path.dirname(answers_file), exist_ok=True)
os.makedirs(os.path.join(os.path.dirname(answers_file), "images"), exist_ok=True)
ans_file = open(answers_file, "w")
save_image_folder = os.path.join(os.path.dirname(os.path.expanduser(args.answers_file)), "images")
for i, line in enumerate(tqdm(questions)):
idx = line["id"]
question = line['conversations'][0]
gt_ans = line["conversations"][1]
qs = question['value']
qs = qs.replace('<image>', '').strip()
cur_prompt = qs
if 'image' in line:
image_file = line["image"]
image = Image.open(os.path.join(args.image_folder, image_file))
image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
images = image_tensor.unsqueeze(0).half().cuda()
if getattr(model.config, 'mm_use_im_start_end', False):
qs = qs + '\n' + DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len + DEFAULT_IM_END_TOKEN
else:
qs = qs + '\n' + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len
cur_prompt = cur_prompt + '\n' + '<image>'
else:
images = None
if args.conv_mode == 'simple_legacy':
qs += '\n\n### Response:'
assert gt_ans['from'] == 'gpt'
# conv = default_conversation.copy()
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
prompt = conv.get_prompt()
inputs = tokenizer([prompt])
input_ids = torch.as_tensor(inputs.input_ids).cuda()
keywords = ['###']
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=images,
do_sample=True,
temperature=0.7,
max_new_tokens=1024,
stopping_criteria=[stopping_criteria])
# TODO: new implementation
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f'[Warning] Sample {i}: {n_diff_input_output} output_ids are not the same as the input_ids')
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
if args.conv_mode == 'simple_legacy':
while True:
cur_len = len(outputs)
outputs = outputs.strip()
for pattern in ['###', 'Assistant:', 'Response:']:
if outputs.startswith(pattern):
outputs = outputs[len(pattern):].strip()
if len(outputs) == cur_len:
break
try:
index = outputs.index(conv.sep)
except ValueError:
outputs += conv.sep
index = outputs.index(conv.sep)
outputs = outputs[:index].strip()
# prompt for answer
if args.answer_prompter:
outputs_reasoning = outputs
inputs = tokenizer([prompt + outputs_reasoning + ' ###\nANSWER:'])
input_ids = torch.as_tensor(inputs.input_ids).cuda()
keywords = ['###']
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=images,
do_sample=True,
temperature=0.7,
max_new_tokens=64,
stopping_criteria=[stopping_criteria])
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f'[Warning] Sample {i}: {n_diff_input_output} output_ids are not the same as the input_ids')
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
try:
index = outputs.index(conv.sep)
except ValueError:
outputs += conv.sep
index = outputs.index(conv.sep)
outputs = outputs[:index].strip()
outputs = outputs_reasoning + '\n The answer is ' + outputs
# new implementation ends
# original implementation
# outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
# try:
# index = outputs.index(conv.sep, len(prompt))
# except ValueError:
# outputs += conv.sep
# index = outputs.index(conv.sep, len(prompt))
# outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip()
ans_id = shortuuid.uuid()
ans_file.write(json.dumps({"question_id": idx,
"prompt": cur_prompt,
"text": outputs,
"answer_id": ans_id,
"model_id": model_name,
"metadata": {}}) + "\n")
ans_file.flush()
ans_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-name", type=str, default="facebook/opt-350m")
parser.add_argument("--image-folder", type=str, default="")
parser.add_argument("--question-file", type=str, default="tables/question.json")
parser.add_argument("--answers-file", type=str, default="answer.jsonl")
parser.add_argument("--mm-projector", type=str, default=None)
parser.add_argument("--vision-tower", type=str, default=None)
parser.add_argument("--conv-mode", type=str, default="simple")
parser.add_argument("--num-chunks", type=int, default=1)
parser.add_argument("--chunk-idx", type=int, default=0)
parser.add_argument("--answer-prompter", action="store_true")
args = parser.parse_args()
eval_model(args)

View File

@@ -0,0 +1,74 @@
"""Generate answers with GPT-3.5"""
# Note: you need to be using OpenAI Python v0.27.0 for the code below to work
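# Example invocation (script and file names are illustrative):
#   python qa_baseline_gpt35.py -q question.jsonl -o answer_gpt35.jsonl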
import argparse
import json
import os
import time
import concurrent.futures
import openai
import tqdm
import shortuuid
MODEL = 'gpt-3.5-turbo'
MODEL_ID = 'gpt-3.5-turbo:20230327'
def get_answer(question_id: int, question: str, max_tokens: int):
ans = {
'answer_id': shortuuid.uuid(),
'question_id': question_id,
'model_id': MODEL_ID,
}
for _ in range(3):
try:
response = openai.ChatCompletion.create(
model=MODEL,
messages=[{
'role': 'system',
'content': 'You are a helpful assistant.'
}, {
'role': 'user',
'content': question,
}],
max_tokens=max_tokens,
)
ans['text'] = response['choices'][0]['message']['content']
return ans
except Exception as e:
print('[ERROR]', e)
ans['text'] = '#ERROR#'
time.sleep(1)
return ans
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ChatGPT answer generation.')
parser.add_argument('-q', '--question')
parser.add_argument('-o', '--output')
parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
args = parser.parse_args()
questions_dict = {}
with open(os.path.expanduser(args.question)) as f:
for line in f:
if not line:
continue
q = json.loads(line)
questions_dict[q['question_id']] = q['text']
answers = []
with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
futures = []
for qid, question in questions_dict.items():
future = executor.submit(get_answer, qid, question, args.max_tokens)
futures.append(future)
for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
answers.append(future.result())
answers.sort(key=lambda x: x['question_id'])
with open(os.path.expanduser(args.output), 'w') as f:
table = [json.dumps(ans) for ans in answers]
f.write('\n'.join(table))

View File

@@ -0,0 +1,125 @@
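# Single-image, single-query LLaVA inference: loads the model (LLaMA or MPT
# backbone), auto-selects a conversation template from the model name unless
# --conv-mode is given, appends image patch tokens to the query, and prints the
# generated response with the template's stop string removed.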
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
from llava.conversation import conv_templates, SeparatorStyle
from llava.utils import disable_torch_init
from transformers import CLIPVisionModel, CLIPImageProcessor, StoppingCriteria
from llava.model import *
from llava.model.utils import KeywordsStoppingCriteria
from PIL import Image
import requests
from io import BytesIO
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
def load_image(image_file):
if image_file.startswith(('http://', 'https://')):
response = requests.get(image_file)
image = Image.open(BytesIO(response.content)).convert('RGB')
else:
image = Image.open(image_file).convert('RGB')
return image
def eval_model(args):
# Model
disable_torch_init()
model_name = os.path.expanduser(args.model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
if "mpt" in model_name.lower():
model = LlavaMPTForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True, torch_dtype=torch.float16, use_cache=True).cuda()
else:
model = LlavaLlamaForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True, torch_dtype=torch.float16, use_cache=True).cuda()
image_processor = CLIPImageProcessor.from_pretrained(model.config.mm_vision_tower, torch_dtype=torch.float16)
mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
if mm_use_im_start_end:
tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
vision_tower = model.get_model().vision_tower[0]
if vision_tower.device.type == 'meta':
vision_tower = CLIPVisionModel.from_pretrained(vision_tower.config._name_or_path, torch_dtype=torch.float16, low_cpu_mem_usage=True).cuda()
model.get_model().vision_tower[0] = vision_tower
else:
vision_tower.to(device='cuda', dtype=torch.float16)
vision_config = vision_tower.config
vision_config.im_patch_token = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0]
vision_config.use_im_start_end = mm_use_im_start_end
if mm_use_im_start_end:
vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
image_token_len = (vision_config.image_size // vision_config.patch_size) ** 2
qs = args.query
if mm_use_im_start_end:
qs = qs + '\n' + DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len + DEFAULT_IM_END_TOKEN
else:
qs = qs + '\n' + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len
if "v1" in model_name.lower():
conv_mode = "llava_v1"
elif "mpt" in model_name.lower():
conv_mode = "mpt_multimodal"
else:
conv_mode = "multimodal"
if args.conv_mode is not None and conv_mode != args.conv_mode:
print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode))
else:
args.conv_mode = conv_mode
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
inputs = tokenizer([prompt])
image = load_image(args.image_file)
image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
input_ids = torch.as_tensor(inputs.input_ids).cuda()
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=image_tensor.unsqueeze(0).half().cuda(),
do_sample=True,
temperature=0.2,
max_new_tokens=1024,
stopping_criteria=[stopping_criteria])
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[:-len(stop_str)]
outputs = outputs.strip()
print(outputs)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-name", type=str, default="facebook/opt-350m")
parser.add_argument("--image-file", type=str, required=True)
parser.add_argument("--query", type=str, required=True)
parser.add_argument("--conv-mode", type=str, default=None)
args = parser.parse_args()
eval_model(args)

View File

@@ -0,0 +1,26 @@
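# Aggregates GPT-4 review files: for every gpt4_text_*.jsonl file in the review
# directory it averages the score tuples per category (and overall) and prints the
# mean score pair plus the second score as a percentage of the first (the evaluated
# model relative to GPT-4).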
import json
import os
from collections import defaultdict
import numpy as np
if __name__ == '__main__':
base_dir = "vqa/reviews/coco2014_val80"
review_files = [x for x in os.listdir(base_dir) if x.endswith('.jsonl') and x.startswith('gpt4_text')]
for review_file in sorted(review_files):
config = review_file.replace('gpt4_text_', '').replace('.jsonl', '')
scores = defaultdict(list)
print(f'GPT-4 vs. {config}')
with open(os.path.join(base_dir, review_file)) as f:
for review_str in f:
review = json.loads(review_str)
scores[review['category']].append(review['tuple'])
scores['all'].append(review['tuple'])
for k, v in scores.items():
stats = np.asarray(v).mean(0).tolist()
stats = [round(x, 3) for x in stats]
print(k, stats, round(stats[1]/stats[0]*100, 1))
print('=================================')