Add files via upload
This commit is contained in:
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,22 @@
|
|||||||
|
from torch.utils.data import Dataset
|
||||||
|
import os
|
||||||
|
class ocrDataset(Dataset):
    """Text-recognition samples listed in a ``test_label.txt`` index file.

    The index is read from ``<image_dir_path>/<dataset_name>/test_label.txt``.
    Each non-blank line is split on whitespace: the first field is the image
    path relative to the dataset directory, the second field is the label.
    """

    def __init__(
        self,
        image_dir_path= "./data/ocr",
        dataset_name = "ct80"
    ):
        """Load the label index.

        Args:
            image_dir_path: Directory that contains one sub-directory per dataset.
            dataset_name: Name of the dataset sub-directory to read.
        """
        self.image_dir_path = image_dir_path
        self.dataset_name = dataset_name
        file_path = os.path.join(image_dir_path, f'{dataset_name}/test_label.txt')
        # Close the file deterministically and drop blank lines, which would
        # otherwise make __getitem__ fail with an IndexError.
        with open(file_path, "r") as file:
            self.lines = [line for line in file if line.strip()]

    def __len__(self):
        """Return the number of labelled samples."""
        return len(self.lines)

    def __getitem__(self, idx):
        """Return ``{"image_path", "gt_answers"}`` for sample ``idx``."""
        fields = self.lines[idx].split()
        image_id = fields[0]
        answers = fields[1]
        img_path = os.path.join(self.image_dir_path, f'{self.dataset_name}/{image_id}')
        return {
            "image_path": img_path,
            "gt_answers": answers}
|
||||||
@@ -0,0 +1,103 @@
|
|||||||
|
from torch.utils.data import Dataset
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
class textVQADataset(Dataset):
    """Question/answer samples read from the ``"data"`` list of an annotation JSON."""

    def __init__(
        self,
        image_dir_path= "./data/textVQA/train_images",
        ann_path= "./data/textVQA/TextVQA_0.5.1_val.json"
    ):
        """Load the annotations.

        Args:
            image_dir_path: Directory holding the ``<image_id>.jpg`` files.
            ann_path: JSON file whose top-level ``"data"`` entry lists the samples.
        """
        # Use a context manager so the annotation file is closed right away.
        with open(ann_path, "r") as ann_file:
            self.data = json.load(ann_file)["data"]
        self.image_dir_path = image_dir_path

    def __len__(self):
        """Return the number of annotated samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{"image_path", "question", "gt_answers"}`` for sample ``idx``."""
        sample = self.data[idx]
        question = sample['question']
        answers = sample['answers']
        img_path = os.path.join(self.image_dir_path, f"{sample['image_id']}.jpg")
        return {
            "image_path": img_path,
            "question": question,
            "gt_answers": answers}
|
||||||
|
|
||||||
|
class docVQADataset(Dataset):
    """Question/answer samples read from the ``"data"`` list of an annotation JSON."""

    def __init__(
        self,
        image_dir_path= "./data/docVQA/val",
        ann_path= "./data/docVQA/val/val_v1.0.json",
    ):
        """Load the annotations.

        Args:
            image_dir_path: Directory that each sample's ``"image"`` path is joined to.
            ann_path: JSON file whose top-level ``"data"`` entry lists the samples.
        """
        # Use a context manager so the annotation file is closed right away.
        with open(ann_path, "r") as ann_file:
            self.data = json.load(ann_file)["data"]
        self.image_dir_path = image_dir_path

    def __len__(self):
        """Return the number of annotated samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{"image_path", "question", "gt_answers"}`` for sample ``idx``."""
        sample = self.data[idx]
        question = sample['question']
        answers = sample['answers']
        img_path = os.path.join(self.image_dir_path, sample['image'])
        return {
            "image_path": img_path,
            "question": question,
            "gt_answers": answers}
|
||||||
|
|
||||||
|
class ocrVQADataset(Dataset):
    """Flattened question/answer pairs from an annotation JSON keyed by image id.

    The JSON maps each image id to a record with parallel ``"questions"`` and
    ``"answers"`` lists; every question becomes one sample.
    """

    def __init__(
        self,
        image_dir_path= "./data/ocrVQA/images",
        ann_path= "./data/ocrVQA/dataset.json",
    ):
        """Load and flatten the annotations.

        Args:
            image_dir_path: Directory holding the ``<image id>.jpg`` files.
            ann_path: Annotation JSON file.
        """
        self.image_list = []
        self.question_list = []
        self.answer_list = []
        with open(ann_path, "r") as ann_file:
            dataset = json.load(ann_file)
        # The leftover debugger breakpoint was removed: it stopped every run here.
        for image_id, record in dataset.items():
            image_file = os.path.join(image_dir_path, f'{image_id}.jpg')
            answers = record['answers']
            for index, question in enumerate(record['questions']):
                self.image_list.append(image_file)
                self.answer_list.append(answers[index])
                self.question_list.append(question)

    def __len__(self):
        """Return the number of flattened question/answer pairs."""
        # The samples live in the parallel lists; there is no ``self.data``.
        return len(self.question_list)

    def __getitem__(self, idx):
        """Return ``{"image_path", "question", "gt_answers"}`` for sample ``idx``."""
        question = self.question_list[idx]
        answers = self.answer_list[idx]
        img_path = self.image_list[idx]
        return {
            "image_path": img_path,
            "question": question,
            "gt_answers": answers}
|
||||||
|
|
||||||
|
class STVQADataset(Dataset):
    """Question/answer samples read from the ``"data"`` list of an annotation JSON.

    Each entry supplies ``"dataset"``, ``"file_name"``, ``"question"`` and
    ``"answers"``; the image path is
    ``<image_dir_path>/<dataset>/<file_name>``.
    """

    def __init__(
        self,
        image_dir_path= "./data/STVQA",
        ann_path= "./data/STVQA/train_task_3.json",
    ):
        """Load the annotations.

        Args:
            image_dir_path: Root directory of the images.
            ann_path: Annotation JSON file.
        """
        self.image_list = []
        self.question_list = []
        self.answer_list = []
        with open(ann_path, "r") as ann_file:
            samples = json.load(ann_file)['data']
        # Iterate over the sample list itself; the length of the top-level
        # JSON object is only its number of keys.
        for sample in samples:
            image_path = image_dir_path+'/'+sample['dataset']+'/'+sample['file_name']
            self.image_list.append(image_path)
            self.answer_list.append(sample['answers'])
            self.question_list.append(sample['question'])

    def __len__(self):
        """Return the number of samples."""
        # The samples live in the parallel lists; there is no ``self.data``.
        return len(self.question_list)

    def __getitem__(self, idx):
        """Return ``{"image_path", "question", "gt_answers"}`` for sample ``idx``."""
        question = self.question_list[idx]
        answers = self.answer_list[idx]
        img_path = self.image_list[idx]
        return {
            "image_path": img_path,
            "question": question,
            "gt_answers": answers}
|
||||||
@@ -0,0 +1,440 @@
|
|||||||
|
import argparse
|
||||||
|
#from models.BLIP2.BLIP2 import BLIP2
|
||||||
|
import more_itertools
|
||||||
|
from tqdm import tqdm
|
||||||
|
import datetime
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from datasets.vqa_dataset import textVQADataset, docVQADataset, ocrVQADataset, STVQADataset
|
||||||
|
from datasets.ocr_dataset import ocrDataset
|
||||||
|
from models.lavis.lavis import lavis
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
def get_model(args):
    """Build the model wrapper selected by ``args.model_name``.

    Args:
        args: Parsed options; ``model_name`` selects the wrapper, and for
            ``'BLIP2'`` the ``BLIP2_model_name``, ``BLIP2_model_type`` and
            ``device`` attributes are forwarded to ``lavis``.

    Returns:
        The constructed model wrapper.

    Raises:
        ValueError: If ``args.model_name`` is not a supported model.
    """
    if args.model_name=='BLIP2':
        return lavis(args.BLIP2_model_name, args.BLIP2_model_type, args.device)
    # Fail with a clear message instead of an UnboundLocalError on `model`.
    raise ValueError(f"Unsupported model_name: {args.model_name!r} (supported: 'BLIP2')")
|
||||||
|
def has_word(sentence, word):
    """Return True if ``word`` occurs in ``sentence`` delimited by word boundaries.

    Args:
        sentence: Text to search.
        word: Literal text to look for; it is regex-escaped before matching.

    Returns:
        True on a match, False otherwise. An empty ``word`` never matches.
    """
    # An empty word would give the pattern r"\b\b", which matches any sentence
    # containing a word character and would score every answer as correct.
    if not word:
        return False
    pattern = r"\b" + re.escape(word) + r"\b"
    return re.search(pattern, sentence) is not None
|
||||||
|
def remove_special_chars(s):
    """Return ``s`` without any character other than ASCII letters, digits and whitespace."""
    return re.sub(r"[^a-zA-Z0-9\s]", "", s)
|
||||||
|
class VQAEval:
    """Normalize answers and score a prediction against ground-truth answers.

    Normalization collapses newlines/tabs, strips punctuation, lower-cases,
    maps number words to digits, drops articles and restores apostrophes in
    known contractions. A prediction scores 1 when a normalized ground-truth
    answer occurs in it as a whole word (see ``has_word``).
    """

    def __init__(self):
        """Build the lookup tables and compiled patterns used for normalization."""
        # Maps a contraction spelled without (some of) its apostrophes to the
        # canonical spelling.
        # NOTE(review): the "somebody'd" entry looks reversed compared with the
        # other entries; it is kept unchanged here - confirm before fixing.
        self.contractions = {
            "aint": "ain't",
            "arent": "aren't",
            "cant": "can't",
            "couldve": "could've",
            "couldnt": "couldn't",
            "couldn'tve": "couldn't've",
            "couldnt've": "couldn't've",
            "didnt": "didn't",
            "doesnt": "doesn't",
            "dont": "don't",
            "hadnt": "hadn't",
            "hadnt've": "hadn't've",
            "hadn'tve": "hadn't've",
            "hasnt": "hasn't",
            "havent": "haven't",
            "hed": "he'd",
            "hed've": "he'd've",
            "he'dve": "he'd've",
            "hes": "he's",
            "howd": "how'd",
            "howll": "how'll",
            "hows": "how's",
            "Id've": "I'd've",
            "I'dve": "I'd've",
            "Im": "I'm",
            "Ive": "I've",
            "isnt": "isn't",
            "itd": "it'd",
            "itd've": "it'd've",
            "it'dve": "it'd've",
            "itll": "it'll",
            "let's": "let's",
            "maam": "ma'am",
            "mightnt": "mightn't",
            "mightnt've": "mightn't've",
            "mightn'tve": "mightn't've",
            "mightve": "might've",
            "mustnt": "mustn't",
            "mustve": "must've",
            "neednt": "needn't",
            "notve": "not've",
            "oclock": "o'clock",
            "oughtnt": "oughtn't",
            "ow's'at": "'ow's'at",
            "'ows'at": "'ow's'at",
            "'ow'sat": "'ow's'at",
            "shant": "shan't",
            "shed've": "she'd've",
            "she'dve": "she'd've",
            "she's": "she's",
            "shouldve": "should've",
            "shouldnt": "shouldn't",
            "shouldnt've": "shouldn't've",
            "shouldn'tve": "shouldn't've",
            "somebody'd": "somebodyd",
            "somebodyd've": "somebody'd've",
            "somebody'dve": "somebody'd've",
            "somebodyll": "somebody'll",
            "somebodys": "somebody's",
            "someoned": "someone'd",
            "someoned've": "someone'd've",
            "someone'dve": "someone'd've",
            "someonell": "someone'll",
            "someones": "someone's",
            "somethingd": "something'd",
            "somethingd've": "something'd've",
            "something'dve": "something'd've",
            "somethingll": "something'll",
            "thats": "that's",
            "thered": "there'd",
            "thered've": "there'd've",
            "there'dve": "there'd've",
            "therere": "there're",
            "theres": "there's",
            "theyd": "they'd",
            "theyd've": "they'd've",
            "they'dve": "they'd've",
            "theyll": "they'll",
            "theyre": "they're",
            "theyve": "they've",
            "twas": "'twas",
            "wasnt": "wasn't",
            "wed've": "we'd've",
            "we'dve": "we'd've",
            "weve": "we've",
            "werent": "weren't",
            "whatll": "what'll",
            "whatre": "what're",
            "whats": "what's",
            "whatve": "what've",
            "whens": "when's",
            "whered": "where'd",
            "wheres": "where's",
            "whereve": "where've",
            "whod": "who'd",
            "whod've": "who'd've",
            "who'dve": "who'd've",
            "wholl": "who'll",
            "whos": "who's",
            "whove": "who've",
            "whyll": "why'll",
            "whyre": "why're",
            "whys": "why's",
            "wont": "won't",
            "wouldve": "would've",
            "wouldnt": "wouldn't",
            "wouldnt've": "wouldn't've",
            "wouldn'tve": "wouldn't've",
            "yall": "y'all",
            "yall'll": "y'all'll",
            "y'allll": "y'all'll",
            "yall'd've": "y'all'd've",
            "y'alld've": "y'all'd've",
            "y'all'dve": "y'all'd've",
            "youd": "you'd",
            "youd've": "you'd've",
            "you'dve": "you'd've",
            "youll": "you'll",
            "youre": "you're",
            "youve": "you've",
        }
        # Number words (and "none") replaced by digits.
        self.manualMap = {
            "none": "0",
            "zero": "0",
            "one": "1",
            "two": "2",
            "three": "3",
            "four": "4",
            "five": "5",
            "six": "6",
            "seven": "7",
            "eight": "8",
            "nine": "9",
            "ten": "10",
        }
        # Words dropped entirely during normalization.
        self.articles = ["a", "an", "the"]

        # Raw strings keep the patterns byte-identical while avoiding
        # invalid-escape warnings.
        # NOTE(review): `(?!<=\d)` is a negative look-ahead for the literal
        # text "<=" plus a digit, not the look-behind `(?<!\d)`; kept as-is so
        # scores do not change - confirm intent.
        self.periodStrip = re.compile(r"(?!<=\d)(\.)(?!\d)")
        self.commaStrip = re.compile(r"(\d)(\,)(\d)")
        # Punctuation handled by processPunctuation (periods are handled
        # separately via periodStrip).
        self.punct = [
            ";",
            r"/",
            "[",
            "]",
            '"',
            "{",
            "}",
            "(",
            ")",
            "=",
            "+",
            "\\",
            "_",
            "-",
            ">",
            "<",
            "@",
            "`",
            ",",
            "?",
            "!",
        ]

    def _normalize(self, text):
        """Apply whitespace cleanup, punctuation and digit/article processing to ``text``."""
        text = text.replace("\n", " ").replace("\t", " ").strip()
        return self.processDigitArticle(self.processPunctuation(text))

    def evaluate(self, answer, gt_answers):
        """Score ``answer`` against one ground-truth string or a list of them.

        Args:
            answer: Predicted answer text.
            gt_answers: A ground-truth string, or a list of ground-truth strings.

        Returns:
            1 if any normalized ground truth occurs in the normalized answer
            as a whole word, otherwise 0.
        """
        answer = self._normalize(answer)
        # A single string is treated as a one-element list. The caller's list
        # is left untouched (it used to be overwritten with normalized text).
        candidates = gt_answers if isinstance(gt_answers, list) else [gt_answers]
        for candidate in candidates:
            if has_word(answer, self._normalize(candidate)):
                return 1
        return 0

    def processPunctuation(self, inText):
        """Strip the characters listed in ``self.punct`` and stray periods from ``inText``.

        A punctuation mark is deleted when it touches a space in ``inText`` or
        when ``inText`` contains a digit,digit comma; otherwise it is replaced
        by a space. Periods matched by ``self.periodStrip`` are then removed.
        """
        outText = inText
        # Loop-invariant: does the text contain a comma between two digits?
        has_digit_comma = self.commaStrip.search(inText) is not None
        for p in self.punct:
            if has_digit_comma or (p + " " in inText) or (" " + p in inText):
                outText = outText.replace(p, "")
            else:
                outText = outText.replace(p, " ")
        # No third argument: on a compiled pattern it is `count`, so passing
        # re.UNICODE there limited the removal to the first 32 periods.
        outText = self.periodStrip.sub("", outText)
        return outText

    def processDigitArticle(self, inText):
        """Lower-case ``inText``, map number words, drop articles, fix contractions.

        Returns:
            The processed words joined by single spaces.
        """
        outText = []
        for word in inText.lower().split():
            # .get() instead of .setdefault(): a lookup must not add every
            # word it sees to the mapping.
            word = self.manualMap.get(word, word)
            if word not in self.articles:
                outText.append(word)
        outText = [self.contractions.get(word, word) for word in outText]
        return " ".join(outText)
|
||||||
|
def evaluate_VQA(
    model,
    dataset,
    model_name,
    dataset_name,
    time,
    batch_size=1,
    answer_path='./answers'
):
    """Run ``model`` over a VQA dataset, save the predictions and return accuracy.

    Args:
        model: Object exposing ``generate(image=..., question=...)``.
        dataset: Iterable of dicts with ``image_path``, ``question`` and
            ``gt_answers`` keys.
        model_name: Label stored with every prediction.
        dataset_name: Used for the output file name and the printed report.
        time: Name of the sub-directory of ``answer_path`` to write into.
        batch_size: Chunk size used while iterating; samples are still passed
            to the model one at a time.
        answer_path: Root directory for prediction files.

    Returns:
        Fraction of samples scored correct by ``VQAEval.evaluate``, or 0.0
        when the dataset is empty.
    """
    predictions=[]
    for batch in more_itertools.chunked(
        tqdm(dataset, desc="Running inference"), batch_size
    ):
        # Process every sample of the chunk; taking only batch[0] silently
        # dropped the rest whenever batch_size > 1.
        for sample in batch:
            output = model.generate(image=sample['image_path'], question=sample['question'])
            answer_dict={'question':sample['question'], 'answer':output,
            'gt_answers':sample['gt_answers'], 'image_path':sample['image_path'],
            'model_name':model_name}
            predictions.append(answer_dict)
    answer_dir = os.path.join(answer_path, time)
    os.makedirs(answer_dir, exist_ok=True)
    answer_file = os.path.join(answer_dir, f"{dataset_name}.json")
    with open(answer_file, "w") as f:
        f.write(json.dumps(predictions, indent=4))
    evaluator = VQAEval()
    # Score what was actually written to disk.
    with open(answer_file, 'r') as f:
        records = json.load(f)
    correct = 0
    for record in records:
        if evaluator.evaluate(record['answer'], record['gt_answers'])==1:
            correct+=1
    num = len(records)
    # Guard against ZeroDivisionError on an empty dataset.
    accuracy = float(correct)/num if num else 0.0
    print(f'{dataset_name}:{accuracy}')
    return accuracy
|
||||||
|
def evaluate_OCR(
    model,
    dataset,
    model_name,
    dataset_name,
    time,
    question='what is written in the image?',
    batch_size=1,
    answer_path='./answers'
):
    """Run ``model`` over a text-recognition dataset and return word accuracy.

    Every image is queried with the same ``question``. A sample counts as
    correct when the ground truth, with special characters removed and
    lower-cased, occurs in the equally processed answer as a whole word.

    Args:
        model: Object exposing ``generate(image=..., question=...)``.
        dataset: Iterable of dicts with ``image_path`` and ``gt_answers`` keys.
        model_name: Label stored with every prediction.
        dataset_name: Used for the output file name and the printed report.
        time: Name of the sub-directory of ``answer_path`` to write into.
        question: Prompt sent to the model for every image.
        batch_size: Chunk size used while iterating; samples are still passed
            to the model one at a time.
        answer_path: Root directory for prediction files.

    Returns:
        Fraction of correct samples, or 0.0 when the dataset is empty.
    """
    predictions=[]
    for batch in more_itertools.chunked(
        tqdm(dataset, desc="Running inference"), batch_size
    ):
        # Process every sample of the chunk; taking only batch[0] silently
        # dropped the rest whenever batch_size > 1.
        for sample in batch:
            output = model.generate(image=sample['image_path'], question=question)
            answer_dict={'question':question, 'answer':output,
            'gt_answers':sample['gt_answers'], 'image_path':sample['image_path'],
            'model_name':model_name}
            predictions.append(answer_dict)
    answer_dir = os.path.join(answer_path, time)
    os.makedirs(answer_dir, exist_ok=True)
    answer_file = os.path.join(answer_dir, f"{dataset_name}.json")
    with open(answer_file, "w") as f:
        f.write(json.dumps(predictions, indent=4))
    # Score what was actually written to disk.
    with open(answer_file, 'r') as f:
        records = json.load(f)
    correct = 0
    for record in records:
        gt_answers = remove_special_chars(record['gt_answers']).lower()
        answer = remove_special_chars(record['answer']).lower()
        if has_word(answer, gt_answers):
            correct+=1
    num = len(records)
    # Guard against ZeroDivisionError on an empty dataset.
    accuracy = float(correct)/num if num else 0.0
    print(f'{dataset_name}:{accuracy}')
    return accuracy
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse the command-line options of the evaluation script.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Demo")

    # Plain string options, registered in display order: OCR datasets, the
    # four VQA datasets, then the directory that receives the results.
    path_options = (
        ("--ocr_dir_path", "./data"),
        ("--ocr_dataset_name", "IIIT5K svt IC13_857 IC15_1811 svtp ct80 cocotext ctw totaltext HOST WOST WordArt"),
        ("--textVQA_image_dir_path", "./data/textVQA/train_images"),
        ("--textVQA_ann_path", "./data/textVQA/TextVQA_0.5.1_val.json"),
        ("--docVQA_image_dir_path", "./data/docVQA/val"),
        ("--docVQA_ann_path", "./data/docVQA/val/val_v1.0.json"),
        ("--ocrVQA_image_dir_path", "./data/ocrVQA/images"),
        ("--ocrVQA_ann_path", "./data/ocrVQA/dataset.json"),
        ("--STVQA_image_dir_path", "./data/STVQA"),
        ("--STVQA_ann_path", "./data/STVQA/train_task_3.json"),
        ("--answer_path", "./answers"),
    )
    for option, default_value in path_options:
        parser.add_argument(option, type=str, default=default_value)

    # One opt-in switch per benchmark.
    for benchmark in ("textVQA", "docVQA", "ocrVQA", "STVQA", "ocr"):
        parser.add_argument(
            f"--eval_{benchmark}",
            action="store_true",
            default=False,
            help=f"Whether to evaluate on {benchmark}."
        )

    # Model selection. Values listed next to the original options:
    #   BLIP2_model_name: blip2_t5 blip2_opt blip2_vicuna_instruct
    #   BLIP2_model_type: pretrain_flant5xxl pretrain_opt6.7b vicuna13b
    #   model_name:       mPLUG, miniGPT4, LLaVA
    model_options = (
        ("--BLIP2_model_name", "blip2_opt"),
        ("--BLIP2_model_type", "pretrain_opt6.7b"),
        ("--model_name", "BLIP2"),
        ("--device", "cuda:2"),
    )
    for option, default_value in model_options:
        parser.add_argument(option, type=str, default=default_value)

    return parser.parse_args()
|
||||||
|
|
||||||
|
def _subsample(dataset, max_sample_num):
    """Return a random ``Subset`` of ``dataset`` with at most ``max_sample_num`` items."""
    # Cap the sample size: drawing more items than exist without replacement
    # raises a ValueError.
    sample_num = min(max_sample_num, len(dataset))
    random_indices = np.random.choice(len(dataset), sample_num, replace=False)
    return torch.utils.data.Subset(dataset, random_indices)


def main(args):
    """Evaluate the selected model on every requested benchmark.

    Per-dataset predictions and a ``result.json`` accuracy summary are written
    under ``<args.answer_path>/<timestamp>``.

    Args:
        args: Options produced by ``parse_args``.
    """
    np.random.seed(0)
    max_sample_num = 5000
    model = get_model(args)
    ocr_dataset_name = args.ocr_dataset_name.split()
    result = {}
    time = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

    # answer_path is forwarded everywhere so predictions land in the same
    # directory as result.json even when --answer_path is overridden.
    if args.eval_textVQA:
        dataset = textVQADataset(args.textVQA_image_dir_path, args.textVQA_ann_path)
        acc = evaluate_VQA(model, dataset, args.model_name, 'textVQA', time, answer_path=args.answer_path)
        result['textVQA'] = acc

    if args.eval_docVQA:
        dataset = docVQADataset(args.docVQA_image_dir_path, args.docVQA_ann_path)
        acc = evaluate_VQA(model, dataset, args.model_name, 'docVQA', time, answer_path=args.answer_path)
        result['docVQA'] = acc

    if args.eval_ocrVQA:
        dataset = ocrVQADataset(args.ocrVQA_image_dir_path, args.ocrVQA_ann_path)
        dataset = _subsample(dataset, max_sample_num)
        acc = evaluate_VQA(model, dataset, args.model_name, 'ocrVQA', time, answer_path=args.answer_path)
        result['ocrVQA'] = acc

    if args.eval_STVQA:
        dataset = STVQADataset(args.STVQA_image_dir_path, args.STVQA_ann_path)
        dataset = _subsample(dataset, max_sample_num)
        acc = evaluate_VQA(model, dataset, args.model_name, 'STVQA', time, answer_path=args.answer_path)
        result['STVQA'] = acc

    if args.eval_ocr:
        for name in ocr_dataset_name:
            dataset = ocrDataset(args.ocr_dir_path, name)
            acc = evaluate_OCR(model, dataset, args.model_name, name, time, answer_path=args.answer_path)
            result[name] = acc

    # The directory is otherwise only created by the evaluators, so it would
    # be missing when no --eval_* switch was given.
    result_dir = os.path.join(args.answer_path, time)
    os.makedirs(result_dir, exist_ok=True)
    result_path = os.path.join(result_dir, 'result.json')
    with open(result_path, "w") as f:
        f.write(json.dumps(result, indent=4))


if __name__ == "__main__":
    args = parse_args()
    main(args)
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
from transformers import Blip2Processor, Blip2ForConditionalGeneration
|
||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
from ..process import pad_image
|
||||||
|
#There are some issues with the Hugging Face version of the BLIP2-opt model.
|
||||||
|
class BLIP2:
    """Wrapper around the Hugging Face BLIP-2 processor and generation model."""

    def __init__(self, model_path, device = "cuda") -> None:
        """Load the processor and the float16 model, and move the model to ``device``.

        Args:
            model_path: Path or identifier passed to ``from_pretrained``.
            device: Device the model and the inputs are moved to.
        """
        self.processor = Blip2Processor.from_pretrained(model_path)
        self.model = Blip2ForConditionalGeneration.from_pretrained(
            model_path, torch_dtype=torch.float16).to(device)
        self.model.eval()
        self.device = device

    def generate(self, image, question, pad=True):
        """Answer ``question`` about the image stored at ``image``.

        Args:
            image: Path of the image file.
            question: Question text inserted into the prompt.
            pad: When true, the image is passed through ``pad_image`` with a
                ``(224,224)`` target first.

        Returns:
            The first decoded sequence with surrounding whitespace stripped.
        """
        prompt =f'Question: {question} Answer:'
        # Convert to RGB, as the sibling `lavis` wrapper does, so grayscale,
        # palette or RGBA files are handled uniformly; the context manager
        # closes the underlying file once the pixel data is copied.
        with Image.open(image) as raw_image:
            image = raw_image.convert("RGB")
        if pad:
            image = pad_image(image, (224,224))
        inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(self.device, torch.float16)
        generated_ids = self.model.generate(**inputs)
        generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
        return generated_text
|
||||||
Binary file not shown.
@@ -0,0 +1,47 @@
|
|||||||
|
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
|
||||||
|
from transformers import CLIPVisionModel, CLIPImageProcessor, StoppingCriteria
|
||||||
|
import torch
|
||||||
|
DEFAULT_IMAGE_TOKEN = "<image>"
|
||||||
|
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
|
||||||
|
DEFAULT_IM_START_TOKEN = "<im_start>"
|
||||||
|
DEFAULT_IM_END_TOKEN = "<im_end>"
|
||||||
|
def disable_torch_init():
    """
    Replace ``reset_parameters`` on ``torch.nn.Linear`` and ``torch.nn.LayerNorm``
    with a no-op, so that the redundant default initialization is skipped and
    model creation is faster.
    """
    def _skip_reset(self):
        return None

    for layer_cls in (torch.nn.Linear, torch.nn.LayerNorm):
        layer_cls.reset_parameters = _skip_reset
|
||||||
|
def patch_config(config):
    """Add the multimodal settings to a saved model config when they are missing.

    The config is loaded from ``config``; if it has no ``mm_vision_tower``
    attribute, the three ``mm`` settings below are set and the config is
    saved back to ``config``. Otherwise nothing is changed.
    """
    cfg = AutoConfig.from_pretrained(config)
    if hasattr(cfg, "mm_vision_tower"):
        return

    print(f'`mm_vision_tower` not found in `{config}`, applying patch and save to disk.')
    missing_settings = (
        ("use_mm_proj", True),
        ("mm_vision_tower", "openai/clip-vit-large-patch14"),
        ("mm_hidden_size", 1024),
    )
    for attr_name, attr_value in missing_settings:
        setattr(cfg, attr_name, attr_value)
    cfg.save_pretrained(config)
|
||||||
|
class LLaVA:
    """Loader for a LLaVA checkpoint together with its tokenizer and image processor."""

    def __init__(self, model_path) -> None:
        """Load the model on CUDA in float16 and register the image tokens.

        Args:
            model_path: Path or identifier passed to the ``from_pretrained`` calls.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        # Ensure the saved config carries the multimodal settings before the
        # model is instantiated from it.
        patch_config(model_path)
        model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16).cuda()
        image_processor = CLIPImageProcessor.from_pretrained(model.config.mm_vision_tower, torch_dtype=torch.float16)

        mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
        if mm_use_im_start_end:
            tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)

        vision_tower = model.model.vision_tower[0]
        vision_tower.to(device='cuda', dtype=torch.float16)
        vision_config = vision_tower.config
        vision_config.im_patch_token = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0]
        vision_config.use_im_start_end = mm_use_im_start_end
        if mm_use_im_start_end:
            vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
        image_token_len = (vision_config.image_size // vision_config.patch_size) ** 2

        # Keep everything that was built; these were previously locals and
        # were lost as soon as __init__ returned.
        self.tokenizer = tokenizer
        self.model = model
        self.image_processor = image_processor
        self.mm_use_im_start_end = mm_use_im_start_end
        self.image_token_len = image_token_len

    def generate(self, image, question):
        """Answer ``question`` about ``image`` (not implemented yet).

        Raises:
            NotImplementedError: Always; the method had no body.
        """
        raise NotImplementedError("LLaVA.generate is not implemented yet")
|
||||||
|
|
||||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,20 @@
|
|||||||
|
import torch
|
||||||
|
from PIL import Image
|
||||||
|
from lavis.models import load_model_and_preprocess
|
||||||
|
from ..process import pad_image
|
||||||
|
class lavis:
    """Wrapper around a model loaded with ``load_model_and_preprocess``."""

    def __init__(self, model_name, model_type, device) -> None:
        """Load the model in eval mode together with its processors.

        Args:
            model_name: Passed as ``name`` to ``load_model_and_preprocess``.
            model_type: Passed as ``model_type`` to ``load_model_and_preprocess``.
            device: Device the model is loaded on and the image is moved to.
        """
        loaded = load_model_and_preprocess(
            name=model_name, model_type=model_type, is_eval=True, device=device
        )
        self.model, self.vis_processors, self.txt_processors = loaded
        self.device = device

    def generate(self, image, question, pad=True):
        """Answer ``question`` about the image stored at ``image``.

        Args:
            image: Path of the image file.
            question: Question text inserted into the prompt.
            pad: When true, the image is passed through ``pad_image`` with a
                ``(224,224)`` target first.

        Returns:
            The first entry returned by ``predict_answers``.
        """
        raw_prompt = f'Question: {question} Short answer:'
        picture = Image.open(image).convert("RGB")
        if pad:
            picture = pad_image(picture, (224,224))
        # The "eval" processors prepare the image and the prompt for the model;
        # unsqueeze(0) adds a leading dimension before moving to the device.
        pixel_input = self.vis_processors["eval"](picture).unsqueeze(0).to(self.device)
        text_input = self.txt_processors["eval"](raw_prompt)
        answers = self.model.predict_answers(
            samples={"image": pixel_input, "text_input": text_input},
            inference_method="generate",
            max_len=32,
        )
        return answers[0]
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,454 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" CLIP model configuration"""
|
||||||
|
|
||||||
|
import copy
|
||||||
|
import os
|
||||||
|
from collections import OrderedDict
|
||||||
|
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
|
||||||
|
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from transformers.processing_utils import ProcessorMixin
|
||||||
|
from transformers.utils import TensorType
|
||||||
|
|
||||||
|
from transformers.configuration_utils import PretrainedConfig
|
||||||
|
from transformers.onnx import OnnxConfig
|
||||||
|
from transformers.utils import logging
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||||
|
"openai/clip-vit-base-patch32": "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/config.json",
|
||||||
|
# See all CLIP models at https://huggingface.co/models?filter=clip
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class CLIPTextConfig(PretrainedConfig):
|
||||||
|
r"""
|
||||||
|
This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP
|
||||||
|
text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
|
||||||
|
with the defaults will yield a similar configuration to that of the text encoder of the CLIP
|
||||||
|
[openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
|
||||||
|
|
||||||
|
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
||||||
|
documentation from [`PretrainedConfig`] for more information.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_size (`int`, *optional*, defaults to 49408):
|
||||||
|
Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by
|
||||||
|
the `inputs_ids` passed when calling [`CLIPModel`].
|
||||||
|
hidden_size (`int`, *optional*, defaults to 512):
|
||||||
|
Dimensionality of the encoder layers and the pooler layer.
|
||||||
|
intermediate_size (`int`, *optional*, defaults to 2048):
|
||||||
|
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||||
|
num_hidden_layers (`int`, *optional*, defaults to 12):
|
||||||
|
Number of hidden layers in the Transformer encoder.
|
||||||
|
num_attention_heads (`int`, *optional*, defaults to 8):
|
||||||
|
Number of attention heads for each attention layer in the Transformer encoder.
|
||||||
|
max_position_embeddings (`int`, *optional*, defaults to 77):
|
||||||
|
The maximum sequence length that this model might ever be used with. Typically set this to something large
|
||||||
|
just in case (e.g., 512 or 1024 or 2048).
|
||||||
|
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
|
||||||
|
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||||
|
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
|
||||||
|
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
|
||||||
|
The epsilon used by the layer normalization layers.
|
||||||
|
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
|
The dropout ratio for the attention probabilities.
|
||||||
|
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||||
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
|
initializer_factor (`float`, *optional*, defaults to 1):
|
||||||
|
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
|
||||||
|
testing).
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
```python
|
||||||
|
>>> from transformers import CLIPTextConfig, CLIPTextModel
|
||||||
|
|
||||||
|
>>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration
|
||||||
|
>>> configuration = CLIPTextConfig()
|
||||||
|
|
||||||
|
>>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
|
||||||
|
>>> model = CLIPTextModel(configuration)
|
||||||
|
|
||||||
|
>>> # Accessing the model configuration
|
||||||
|
>>> configuration = model.config
|
||||||
|
```"""
|
||||||
|
model_type = "clip_text_model"
|
||||||
|
|
||||||
|
def __init__(
    self,
    vocab_size=49408,
    hidden_size=512,
    intermediate_size=2048,
    projection_dim=512,
    num_hidden_layers=12,
    num_attention_heads=8,
    max_position_embeddings=77,
    hidden_act="quick_gelu",
    layer_norm_eps=1e-6,
    attention_dropout=0.0,
    initializer_range=0.02,
    initializer_factor=1.0,
    pad_token_id=1,
    bos_token_id=0,
    eos_token_id=2,
    **kwargs,
):
    """Record the text-encoder hyper-parameters on this configuration.

    The three special-token ids and any extra keyword arguments are handed
    to the parent initializer; every remaining argument is stored unchanged
    as an instance attribute of the same name.
    """
    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        **kwargs,
    )

    # Sizes of the embedding table and of the transformer stack.
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.intermediate_size = intermediate_size
    self.projection_dim = projection_dim
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.max_position_embeddings = max_position_embeddings

    # Numerical / activation settings.
    self.layer_norm_eps = layer_norm_eps
    self.hidden_act = hidden_act

    # Weight initialisation and regularisation settings.
    self.initializer_range = initializer_range
    self.initializer_factor = initializer_factor
    self.attention_dropout = attention_dropout
|
||||||
|
|
||||||
|
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
    """Build the text configuration from a saved config.

    The raw dictionary is obtained through ``cls.get_config_dict``. When it
    describes a composite ``"clip"`` config, only its ``"text_config"``
    entry is used. A warning is logged if the stored ``model_type`` differs
    from this class's ``model_type``; the instance is created with
    ``cls.from_dict`` either way.
    """
    loaded_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

    # A composite CLIP config nests the text settings one level down.
    if loaded_dict.get("model_type") == "clip":
        loaded_dict = loaded_dict["text_config"]

    if "model_type" in loaded_dict and hasattr(cls, "model_type"):
        loaded_type = loaded_dict["model_type"]
        if loaded_type != cls.model_type:
            logger.warning(
                f"You are using a model of type {loaded_type} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

    return cls.from_dict(loaded_dict, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class CLIPVisionConfig(PretrainedConfig):
    r"""
    Configuration container for a [`CLIPVisionModel`].

    The arguments below define the architecture of the CLIP vision encoder. Leaving every argument at its default
    gives a configuration similar to the vision encoder of
    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32).

    This class derives from [`PretrainedConfig`]; refer to that class's documentation for the options it adds.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Stored unchanged on the configuration. Presumably the size of the projection layer; confirm against
            the model code.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Stored unchanged on the configuration. Presumably the number of input image channels; confirm
            against the model code.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            `"gelu"`, `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for
            initialization testing).

    Example:

    ```python
    >>> from transformers import CLIPVisionConfig, CLIPVisionModel

    >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
    >>> configuration = CLIPVisionConfig()

    >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
    >>> model = CLIPVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "clip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        """Store every vision hyper-parameter as an attribute of the same name."""
        super().__init__(**kwargs)

        # Transformer stack dimensions.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Input image geometry.
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size

        # Initialisation, regularisation and numerical settings.
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """Build the vision configuration from a saved config.

        When the loaded dictionary describes a composite ``"clip"`` config,
        only its ``"vision_config"`` entry is used. A warning is logged if
        the stored ``model_type`` differs from this class's ``model_type``.
        """
        loaded_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # A composite CLIP config nests the vision settings one level down.
        if loaded_dict.get("model_type") == "clip":
            loaded_dict = loaded_dict["vision_config"]

        if "model_type" in loaded_dict and hasattr(cls, "model_type"):
            loaded_type = loaded_dict["model_type"]
            if loaded_type != cls.model_type:
                logger.warning(
                    f"You are using a model of type {loaded_type} to instantiate a model of type "
                    f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
                )

        return cls.from_dict(loaded_dict, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class CLIPConfig(PretrainedConfig):
    r"""
    Configuration container for a [`CLIPModel`].

    It groups the text-model and vision-model configurations that together define a CLIP model. With all
    arguments left at their defaults the result is similar to the
    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

    This class derives from [`PretrainedConfig`]; refer to that class's documentation for the options it adds.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPTextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP
            implementation.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import CLIPConfig, CLIPModel

    >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration
    >>> configuration = CLIPConfig()

    >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
    >>> model = CLIPModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig
    >>> from transformers import CLIPTextConfig, CLIPVisionConfig

    >>> # Initializing a CLIPText and CLIPVision configuration
    >>> config_text = CLIPTextConfig()
    >>> config_vision = CLIPVisionConfig()

    >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision)
    ```"""

    model_type = "clip"
    is_composition = True

    def __init__(
        self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
    ):
        """Assemble the composite config from its text and vision parts.

        The legacy keyword arguments ``text_config_dict`` and
        ``vision_config_dict`` are still honoured: they are removed from
        ``kwargs`` before the parent initializer runs (so they are not
        stored), expanded through the matching sub-config class, and their
        values overwrite the corresponding entries of ``text_config`` /
        ``vision_config``. A warning is logged for every overwritten key
        whose value differs.
        """
        legacy_text = kwargs.pop("text_config_dict", None)
        legacy_vision = kwargs.pop("vision_config_dict", None)

        super().__init__(**kwargs)

        if legacy_text is not None:
            if text_config is None:
                text_config = {}

            # Full set of values implied by the legacy dictionary (defaults included).
            _text_config_dict = CLIPTextConfig(**legacy_text).to_dict()

            for key, value in _text_config_dict.items():
                if key == "transformers_version" or key not in text_config:
                    continue
                if value != text_config[key]:
                    if key in legacy_text:
                        # The conflicting value was given explicitly in the legacy dictionary.
                        message = (
                            f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
                            f'The value `text_config_dict["{key}"]` will be used instead.'
                        )
                    else:
                        # The conflicting value comes from a default argument of CLIPTextConfig.
                        message = (
                            f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
                            f'value `text_config["{key}"]` will be overriden.'
                        )
                    logger.warning(message)

            # The legacy values take precedence over `text_config`.
            text_config.update(_text_config_dict)

        if legacy_vision is not None:
            if vision_config is None:
                vision_config = {}

            # Full set of values implied by the legacy dictionary (defaults included).
            _vision_config_dict = CLIPVisionConfig(**legacy_vision).to_dict()
            # Use string keys in `id2label` rather than integers.
            if "id2label" in _vision_config_dict:
                _vision_config_dict["id2label"] = {
                    str(key): value for key, value in _vision_config_dict["id2label"].items()
                }

            for key, value in _vision_config_dict.items():
                if key == "transformers_version" or key not in vision_config:
                    continue
                if value != vision_config[key]:
                    if key in legacy_vision:
                        # The conflicting value was given explicitly in the legacy dictionary.
                        message = (
                            f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
                            f'values. The value `vision_config_dict["{key}"]` will be used instead.'
                        )
                    else:
                        # The conflicting value comes from a default argument of CLIPVisionConfig.
                        message = (
                            f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
                            f'The value `vision_config["{key}"]` will be overriden.'
                        )
                    logger.warning(message)

            # The legacy values take precedence over `vision_config`.
            vision_config.update(_vision_config_dict)

        if text_config is None:
            text_config = {}
            logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")

        if vision_config is None:
            vision_config = {}
            logger.info("`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.")

        self.text_config = CLIPTextConfig(**text_config)
        self.vision_config = CLIPVisionConfig(**vision_config)

        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.initializer_factor = 1.0

    @classmethod
    def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
        r"""
        Create a [`CLIPConfig`] (or a derived class) from an existing text configuration and an existing vision
        configuration. Both are converted to dictionaries before being passed to the constructor.

        Returns:
            [`CLIPConfig`]: An instance of a configuration object
        """
        text_dict = text_config.to_dict()
        vision_dict = vision_config.to_dict()
        return cls(text_config=text_dict, vision_config=vision_dict, **kwargs)

    def to_dict(self):
        """
        Serialize this instance to a Python dictionary. Overrides the default [`~PretrainedConfig.to_dict`] so
        that the nested text and vision configs are emitted as dictionaries and `model_type` is included.

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
        """
        serialized = copy.deepcopy(self.__dict__)
        serialized["text_config"] = self.text_config.to_dict()
        serialized["vision_config"] = self.vision_config.to_dict()
        serialized["model_type"] = self.__class__.model_type
        return serialized
|
||||||
|
|
||||||
|
|
||||||
|
class CLIPOnnxConfig(OnnxConfig):
    """ONNX export description for CLIP: named inputs/outputs, tolerance, dummy inputs and opset."""

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from each input name to its axis-index -> axis-name table."""
        axes = OrderedDict()
        axes["input_ids"] = {0: "batch", 1: "sequence"}
        axes["pixel_values"] = {0: "batch", 1: "num_channels", 2: "height", 3: "width"}
        axes["attention_mask"] = {0: "batch", 1: "sequence"}
        return axes

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from each output name to its axis table (only axis 0, "batch", is named)."""
        axes = OrderedDict()
        for output_name in ("logits_per_image", "logits_per_text", "text_embeds", "image_embeds"):
            axes[output_name] = {0: "batch"}
        return axes

    @property
    def atol_for_validation(self) -> float:
        """Absolute tolerance value exposed for validation."""
        return 1e-4

    def generate_dummy_inputs(
        self,
        processor: "ProcessorMixin",
        batch_size: int = -1,
        seq_length: int = -1,
        framework: Optional["TensorType"] = None,
    ) -> Mapping[str, Any]:
        """Produce dummy text and image inputs and merge them into one dict.

        The parent implementation is called twice: once with
        ``processor.tokenizer`` and once with ``processor.feature_extractor``.
        On a key collision the image-side entry wins, because it is unpacked
        last.
        """
        parent = super()
        text_inputs = parent.generate_dummy_inputs(
            processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
        )
        image_inputs = parent.generate_dummy_inputs(
            processor.feature_extractor, batch_size=batch_size, framework=framework
        )
        return {**text_inputs, **image_inputs}

    @property
    def default_onnx_opset(self) -> int:
        """Default ONNX opset version for this config."""
        return 14
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,44 @@
|
|||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import torch
|
||||||
|
from transformers.models.llama.configuration_llama import LlamaConfig
|
||||||
|
from mplug_owl.configuration_mplug_owl import mPLUG_OwlConfig
|
||||||
|
from mplug_owl.modeling_mplug_owl import mPLUG_OwlForConditionalGeneration
|
||||||
|
from transformers.models.llama.tokenization_llama import LlamaTokenizer
|
||||||
|
from mplug_owl.modeling_mplug_owl import ImageProcessor
|
||||||
|
from mplug_owl.tokenize_utils import tokenize_prompts
|
||||||
|
class mPLUG:
    """Inference wrapper that pairs an mPLUG-Owl model with its tokenizer and image processor."""

    def __init__(self, checkpoint_path=None, tokenizer_path=None) -> None:
        """Build the model in bfloat16, optionally load weights, and create the tokenizer.

        Args:
            checkpoint_path: Optional path passed to ``torch.load``; the
                loaded state dict is applied with ``strict=False`` and the
                load report is printed.
            tokenizer_path: Path handed to ``LlamaTokenizer``. Required.

        Raises:
            AssertionError: If ``tokenizer_path`` is ``None``.
        """
        # Check the required argument first so a missing tokenizer path fails
        # before the model is constructed and a checkpoint is read.
        assert tokenizer_path is not None, "tokenizer_path must be provided"

        config = mPLUG_OwlConfig()
        self.model = mPLUG_OwlForConditionalGeneration(config=config).to(torch.bfloat16)
        self.model.eval()

        if checkpoint_path is not None:
            # NOTE(review): torch.load unpickles the file; only load checkpoints from trusted sources.
            tmp_ckpt = torch.load(
                checkpoint_path, map_location='cpu')
            msg = self.model.load_state_dict(tmp_ckpt, strict=False)
            print(msg)

        self.tokenizer = LlamaTokenizer(
            tokenizer_path, pad_token='<unk>', add_bos_token=False)
        self.img_processor = ImageProcessor()

    def generate(self, image, question, max_length=512, top_k=1, do_sample=True, **generate_kwargs):
        """Answer ``question`` about ``image`` and return the decoded text.

        Args:
            image: Input forwarded to the image processor.
            question: Text inserted into the conversation prompt.
            max_length: Forwarded to ``self.model.generate`` as ``max_length``.
            top_k: Forwarded to ``self.model.generate``.
            do_sample: Forwarded to ``self.model.generate``.
            **generate_kwargs: Extra keyword arguments forwarded unchanged.

        Returns:
            The first generated sequence, decoded with special tokens skipped.
        """
        prompts = [
            f'''The following is a conversation between a curious human and AI assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
Human: <image>
Human: {question}
AI: ''']
        # The unpadded prompt lengths returned in the middle slot are not needed here.
        context_tokens_tensor, _, attention_mask = tokenize_prompts(
            prompts=prompts, tokens_to_generate=0, add_BOS=True, tokenizer=self.tokenizer, ignore_dist=True)
        images = self.img_processor(image).to(torch.bfloat16).cuda()
        context_tokens_tensor = context_tokens_tensor.cuda()
        self.model.eval()
        with torch.no_grad():
            # The keyword was previously misspelled as `max_lengt`, so the
            # caller's length limit never reached generation under its real name.
            res = self.model.generate(input_ids=context_tokens_tensor, pixel_values=images,
                                      attention_mask=attention_mask, max_length=max_length,
                                      top_k=top_k, do_sample=do_sample, **generate_kwargs)
        sentence = self.tokenizer.decode(res.tolist()[0], skip_special_tokens=True)
        return sentence
|
||||||
|
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,154 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2023 Alibaba Inc. and The HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
import copy
|
||||||
|
import os
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
from transformers.configuration_utils import PretrainedConfig
|
||||||
|
from transformers.models.auto import CONFIG_MAPPING
|
||||||
|
from transformers.models.auto.modeling_auto import \
|
||||||
|
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
|
||||||
|
from transformers.utils import logging
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class mPLUG_OwlVisualAbstractorConfig(PretrainedConfig):
    """Configuration holding the hyper-parameters of the mPLUG-Owl visual abstractor."""

    model_type = "mPLUG_OwlVisualAbstractor"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=1024,
        num_hidden_layers=6,
        num_attention_heads=8,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        pad_token_id=0,
        position_embedding_type="absolute",
        classifier_dropout=None,
        cross_attention_frequency=2,
        encoder_hidden_size=1024,
        **kwargs,
    ):
        """Store each argument as an attribute of the same name.

        ``pad_token_id`` and any extra keyword arguments are forwarded to the
        parent initializer instead of being assigned here.
        """
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        # Transformer stack dimensions and activation.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size

        # Dropout, positions, initialisation and normalisation.
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.classifier_dropout = classifier_dropout

        # Cross-attention settings.
        self.cross_attention_frequency = cross_attention_frequency
        self.encoder_hidden_size = encoder_hidden_size

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """Build the abstractor configuration from a saved config.

        When the loaded dictionary describes a composite ``"mplug_owl"``
        config, only its ``"abstractor_config"`` entry is used. A warning is
        logged if the stored ``model_type`` differs from this class's
        ``model_type``.
        """
        loaded_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)

        # A composite mPLUG-Owl config nests the abstractor settings one level down.
        if loaded_dict.get("model_type") == "mplug_owl":
            loaded_dict = loaded_dict["abstractor_config"]

        if "model_type" in loaded_dict and hasattr(cls, "model_type"):
            loaded_type = loaded_dict["model_type"]
            if loaded_type != cls.model_type:
                logger.warning(
                    f"You are using a model of type {loaded_type} to instantiate a model of type "
                    f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
                )

        return cls.from_dict(loaded_dict, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class mPLUG_OwlConfig(PretrainedConfig):
    """Composite configuration bundling the vision, visual-abstractor and text sub-configurations."""

    model_type = "mplug_owl"
    is_composition = True

    def __init__(self, vision_config=None, visual_abstractor_config=None, text_config=None, num_query_tokens=64, **kwargs):
        """Create the three sub-configs, filling in defaults for any that are ``None``.

        Args:
            vision_config: Dict of ``CLIPVisionConfig`` options. When ``None``,
                the module-level ``vision_config_dict`` is used with
                ``layer_norm_eps=1e-6``.
            visual_abstractor_config: Dict of ``mPLUG_OwlVisualAbstractorConfig``
                options. When ``None``, an empty dict (all defaults) is used.
            text_config: Dict of options for the text model config. When
                ``None``, ``LlamaConfig(pad_token_id=2)`` is used.
            num_query_tokens: Stored unchanged as ``self.num_query_tokens``.
            **kwargs: Forwarded to the parent initializer.
        """
        super().__init__(**kwargs)
        from clip.configuration_clip import CLIPVisionConfig

        if vision_config is None:
            # By default we use openai-clip large patch14
            default_vision = CLIPVisionConfig(**vision_config_dict, layer_norm_eps=1e-6)
            vision_config = default_vision.to_dict()
            logger.info("vision_config is None.")

        if visual_abstractor_config is None:
            visual_abstractor_config = {}
            logger.info("abstractor_config is None. ")

        if text_config is None:
            # we use LLAMA 7b by default
            from transformers.models.llama.configuration_llama import LlamaConfig
            text_config = LlamaConfig(pad_token_id=2).to_dict()
            logger.info("text_config is None.")

        self.vision_config = CLIPVisionConfig(**vision_config)
        self.visual_abstractor_config = mPLUG_OwlVisualAbstractorConfig(**visual_abstractor_config)
        # The abstractor's layer-norm epsilon is always forced to 1e-6, whatever the dict said.
        self.visual_abstractor_config.layer_norm_eps = 1e-6

        # Resolve the text config class by its model type; "opt" is the fallback when none is given.
        if "model_type" in text_config:
            text_model_type = text_config["model_type"]
        else:
            text_model_type = "opt"
        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)

        # Mirror two flags of the text config on the composite config.
        self.tie_word_embeddings = self.text_config.tie_word_embeddings
        self.is_encoder_decoder = self.text_config.is_encoder_decoder

        self.num_query_tokens = num_query_tokens
        # The abstractor's encoder width is tied to the vision tower's hidden size.
        self.visual_abstractor_config.encoder_hidden_size = self.vision_config.hidden_size
        self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
        self.initializer_factor = 1.0
        self.initializer_range = 0.02

    def to_dict(self):
        """
        Serialize this instance to a Python dictionary. Overrides the default [`~PretrainedConfig.to_dict`] so
        that the nested sub-configs are emitted as dictionaries. The abstractor is written under the key
        `"abstractor_config"`, in addition to the deep-copied `visual_abstractor_config` attribute.

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
        """
        serialized = copy.deepcopy(self.__dict__)
        serialized["vision_config"] = self.vision_config.to_dict()
        serialized["abstractor_config"] = self.visual_abstractor_config.to_dict()
        serialized["text_config"] = self.text_config.to_dict()
        serialized["model_type"] = self.__class__.model_type
        return serialized
|
||||||
|
|
||||||
|
|
||||||
|
# Default keyword arguments unpacked into CLIPVisionConfig when
# mPLUG_OwlConfig is created without an explicit vision_config.
vision_config_dict = dict(
    hidden_size=1024,
    intermediate_size=4096,
    num_attention_heads=8,
    num_hidden_layers=24,
    patch_size=14,
    projection_dim=768,
)
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,171 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Tokenization utilities."""
|
||||||
|
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from icecream import ic
|
||||||
|
|
||||||
|
|
||||||
|
def detokenize_generations(tokens_gpu_tensor,
                           lengths_gpu_tensor,
                           return_segments, tokenizer):
    """Convert generated token ids back to text.

    Args:
        tokens_gpu_tensor: Tensor of token ids, one row per sequence.
        lengths_gpu_tensor: Tensor holding the number of leading tokens to
            keep from each row.
        return_segments: When truthy, also return a per-sequence list of
            decoded pieces.
        tokenizer: Object providing ``detokenize``; when it has a
            ``tokenizer`` attribute, that inner object's ``decoder`` (and
            ``byte_decoder``) are used to build the segments.

    Returns:
        ``(tokens, texts)`` or, when ``return_segments`` is truthy,
        ``(tokens, texts, segments)``. ``tokens`` is the full, untruncated
        nested list of ids.
    """
    # NOTE(review): block nesting was reconstructed from a whitespace-stripped
    # copy; the segment list is assumed to receive one entry per sequence
    # whichever decoder branch runs. Confirm against the original file.
    texts = []
    if return_segments:
        segments = []

    tokens = tokens_gpu_tensor.cpu().numpy().tolist()
    lengths = lengths_gpu_tensor.cpu().numpy().tolist()
    for row, length in zip(tokens, lengths):
        # Rebinding a local slice leaves the rows inside `tokens` untouched.
        kept = row[:length]
        texts.append(tokenizer.detokenize(kept))
        if not return_segments:
            continue

        from tokenizers.decoders import Metaspace
        if not hasattr(tokenizer, 'tokenizer'):
            words = tokenizer.detokenize(kept)
        elif isinstance(tokenizer.tokenizer.decoder, Metaspace):
            words = tokenizer.tokenizer.decode(kept)
        else:
            # Look each id up in the decoder table, then map its characters
            # back to raw bytes and decode them as UTF-8.
            words = []
            for token in kept:
                piece = tokenizer.tokenizer.decoder[token]
                raw = bytearray([tokenizer.tokenizer.byte_decoder[c] for c in piece])
                words.append(raw.decode('utf-8', errors='replace'))
        segments.append(words)

    if return_segments:
        return tokens, texts, segments

    return tokens, texts
|
||||||
|
|
||||||
|
|
||||||
|
def tokenize_prompts(prompts=None, tokens_to_generate=None,
                     add_BOS=None, rank=0, tokenizer=None, ignore_dist=False):
    """Tokenize and pad the prompts on the designated rank.

    No broadcast is performed here: on any rank other than ``rank`` (when
    ``ignore_dist`` is false) all three returned values are ``None``.

    Args:
        prompts: Prompts to tokenize. Required on the tokenizing rank.
        tokens_to_generate: Number of tokens to reserve for generation.
            Required on the tokenizing rank.
        add_BOS: Forwarded to ``_tokenize_prompts_and_batch``.
        rank: Rank on which tokenization happens.
        tokenizer: Tokenizer forwarded to ``_tokenize_prompts_and_batch``.
        ignore_dist: When true, ``torch.distributed`` is not queried and
            tokenization always runs.

    Returns:
        A tuple ``(tokens, lengths, attention_mask)`` as produced by
        ``_tokenize_prompts_and_batch``, or ``(None, None, None)``.

    Raises:
        AssertionError: If ``prompts`` or ``tokens_to_generate`` is ``None``
            on the tokenizing rank.
    """
    # Defaults returned unchanged on ranks that do not tokenize.
    prompts_tokens_cuda_long_tensor = None
    prompts_length_cuda_long_tensor = None
    attention_mask = None

    # `ignore_dist` is tested first so torch.distributed is not touched in
    # single-process use.
    if ignore_dist or torch.distributed.get_rank() == rank:
        assert prompts is not None
        assert tokens_to_generate is not None
        # Padded token tensor, unpadded lengths, and attention mask.
        prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor, attention_mask = \
            _tokenize_prompts_and_batch(
                prompts, tokens_to_generate, add_BOS, tokenizer)

    return prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor, attention_mask
|
||||||
|
|
||||||
|
|
||||||
|
def _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS, tokenizer):
    """Tokenize a batch of prompts and pack them into fixed-width tensors.

    Steps:
        - tokenize every prompt with ``_tokenize_prompt``
        - compute the row width as the longest prompt length plus
          ``tokens_to_generate``
        - right-fill every row with ``tokenizer.eos_token_id`` up to that
          width so the batch can be stored as a 2D tensor

    Returns:
        A tuple ``(tokens, lengths, attention_mask)`` where ``tokens`` is a
        ``torch.LongTensor`` of the filled rows, ``lengths`` is a
        ``torch.LongTensor`` of the unpadded prompt lengths, and
        ``attention_mask`` has the same 2D shape as ``tokens`` with 1 over
        each row's prompt positions and 0 elsewhere.
    """
    token_rows = [_tokenize_prompt(text, tokenizer, add_BOS)
                  for text in prompts]
    row_lengths = [len(row) for row in token_rows]

    # Width of every sample: longest prompt plus room for generated tokens.
    row_width = max(row_lengths) + tokens_to_generate

    fill_id = tokenizer.eos_token_id
    filled_rows = [row + [fill_id] * (row_width - length)
                   for row, length in zip(token_rows, row_lengths)]

    # All rows now share one length, so they can be converted to tensors.
    tokens_tensor = torch.LongTensor(filled_rows)
    lengths_tensor = torch.LongTensor(row_lengths)

    # A position is attended to iff its column index is below the row's
    # prompt length; cast to the default float dtype used by torch.zeros.
    columns = torch.arange(tokens_tensor.shape[1])
    attention_mask = (columns.unsqueeze(0) < lengths_tensor.unsqueeze(1)).to(
        torch.get_default_dtype())

    return tokens_tensor, lengths_tensor, attention_mask
|
||||||
|
|
||||||
|
|
||||||
|
def _tokenize_prompt(prompt, tokenizer, add_BOS=False, media_info={'<image>': 65}):
|
||||||
|
media_tokens = {k: -int(i+1) for i, k in enumerate(media_info.keys())}
|
||||||
|
media_lengths = media_info.copy()
|
||||||
|
|
||||||
|
if add_BOS:
|
||||||
|
prompt_chunk = [tokenizer.bos_token_id]
|
||||||
|
else:
|
||||||
|
prompt_chunk = []
|
||||||
|
|
||||||
|
# Pure Text
|
||||||
|
if all([media_token not in prompt for media_token in media_tokens.keys()]):
|
||||||
|
enc_chunk = prompt_chunk + \
|
||||||
|
tokenizer(prompt, add_special_tokens=False)['input_ids']
|
||||||
|
|
||||||
|
# Multi-Modal Text
|
||||||
|
else:
|
||||||
|
enc_chunk = prompt_chunk
|
||||||
|
pattern = '|'.join(map(re.escape, list(media_tokens.keys())))
|
||||||
|
chunk_strs = re.split(f'({pattern})', prompt)
|
||||||
|
chunk_strs = [x for x in chunk_strs if len(x) > 0]
|
||||||
|
for idx, chunk_str in enumerate(chunk_strs):
|
||||||
|
if chunk_str in media_tokens:
|
||||||
|
enc_chunk += [media_tokens[chunk_str]] * \
|
||||||
|
media_lengths[chunk_str]
|
||||||
|
else:
|
||||||
|
tmp_chunk = tokenizer(chunk_str, add_special_tokens=False)[
|
||||||
|
'input_ids']
|
||||||
|
# if idx < len(chunk_strs) - 1: # Last chunk should not have eos
|
||||||
|
# tmp_chunk += [tokenizer.eod_id]
|
||||||
|
enc_chunk += tmp_chunk
|
||||||
|
return enc_chunk
|
||||||
@@ -0,0 +1,29 @@
|
|||||||
|
import PIL
|
||||||
|
def pad_image(image, target_size, margin=128):
    """Scale an image to fit ``target_size`` and center it on a black canvas.

    The image is resized with bicubic interpolation, preserving its aspect
    ratio, so that it fits inside ``target_size``. It is then pasted in the
    middle of a black RGB canvas that is ``margin`` pixels larger than
    ``target_size`` in both dimensions.

    :param image: input PIL image
    :param target_size: a tuple (width, height) the scaled image must fit in
    :param margin: extra pixels added to both the canvas width and height;
        defaults to 128, the value that used to be hard-coded
    :return: new RGB image of size (width + margin, height + margin)
    """
    # A bare ``import PIL`` does not load the ``PIL.Image`` submodule, so
    # import it explicitly rather than relying on another module having
    # done so already.
    from PIL import Image

    iw, ih = image.size    # size of the original image
    w, h = target_size     # size the scaled image has to fit in

    # Smallest ratio, so that both dimensions fit inside the target.
    scale = min(w / iw, h / ih)

    # At least one side matches the target exactly; adding 0.5 rounds to the
    # nearest integer. Clamp to 1 because resizing to a zero-sized dimension
    # is rejected by Pillow.
    nw = max(1, int(iw * scale + 0.5))
    nh = max(1, int(ih * scale + 0.5))

    canvas_w = w + margin
    canvas_h = h + margin

    resized = image.resize((nw, nh), Image.BICUBIC)
    canvas = Image.new('RGB', (canvas_w, canvas_h), (0, 0, 0))  # black canvas
    # Integer division gives the top-left corner that centers the image,
    # leaving black borders around it.
    canvas.paste(resized, ((canvas_w - nw) // 2, (canvas_h - nh) // 2))

    return canvas
|
||||||
Reference in New Issue
Block a user