Add files via upload

This commit is contained in:
lz
2023-05-17 03:38:36 +08:00
committed by GitHub
parent eb33084cb5
commit da758a9ca7
32 changed files with 145202 additions and 0 deletions
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
View File
Binary file not shown.
Binary file not shown.
Binary file not shown.
View File
+22
View File
@@ -0,0 +1,22 @@
from torch.utils.data import Dataset
import os
class ocrDataset(Dataset):
    """Text-recognition samples indexed by a ``test_label.txt`` file.

    The index file is read from
    ``<image_dir_path>/<dataset_name>/test_label.txt``. Every non-blank line
    is split on whitespace: the first field is the image file name (relative
    to the dataset directory) and the second field is the ground-truth text.

    Args:
        image_dir_path: Root directory that contains one sub-directory per
            dataset.
        dataset_name: Name of the dataset sub-directory.
    """

    def __init__(
        self,
        image_dir_path= "./data/ocr",
        dataset_name = "ct80"
    ):
        self.image_dir_path = image_dir_path
        self.dataset_name = dataset_name
        file_path = os.path.join(image_dir_path, f'{dataset_name}/test_label.txt')
        # Close the handle deterministically and drop blank lines (e.g. a
        # trailing newline), which would otherwise be counted as samples and
        # fail with IndexError in __getitem__.
        with open(file_path, "r") as file:
            self.lines = [line for line in file if line.strip()]

    def __len__(self):
        """Return the number of labelled samples."""
        return len(self.lines)

    def __getitem__(self, idx):
        """Return ``{"image_path": ..., "gt_answers": ...}`` for sample ``idx``."""
        fields = self.lines[idx].split()
        image_id = fields[0]
        answers = fields[1]
        img_path = os.path.join(self.image_dir_path, f'{self.dataset_name}/{image_id}')
        return {
            "image_path": img_path,
            "gt_answers": answers}
+103
View File
@@ -0,0 +1,103 @@
from torch.utils.data import Dataset
import os
import json
class textVQADataset(Dataset):
    """TextVQA samples read from a JSON annotation file.

    The annotation file must be a JSON object whose ``"data"`` entry is a
    list of records with ``question``, ``answers`` and ``image_id`` keys.
    Images are looked up as ``<image_dir_path>/<image_id>.jpg``.

    Args:
        image_dir_path: Directory containing the images.
        ann_path: Path to the JSON annotation file.
    """

    def __init__(
        self,
        image_dir_path= "./data/textVQA/train_images",
        ann_path= "./data/textVQA/TextVQA_0.5.1_val.json"
    ):
        # Use a context manager so the annotation file handle is closed.
        with open(ann_path, "r") as ann_file:
            self.data = json.load(ann_file)["data"]
        self.image_dir_path = image_dir_path

    def __len__(self):
        """Return the number of annotated questions."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the image path, question and ground-truth answers for ``idx``."""
        record = self.data[idx]
        img_path = os.path.join(self.image_dir_path, f"{record['image_id']}.jpg")
        return {
            "image_path": img_path,
            "question": record['question'],
            "gt_answers": record['answers']}
class docVQADataset(Dataset):
    """DocVQA samples read from a JSON annotation file.

    The annotation file must be a JSON object whose ``"data"`` entry is a
    list of records with ``question``, ``answers`` and ``image`` keys, where
    ``image`` is a path relative to ``image_dir_path``.

    Args:
        image_dir_path: Directory the ``image`` entries are relative to.
        ann_path: Path to the JSON annotation file.
    """

    def __init__(
        self,
        image_dir_path= "./data/docVQA/val",
        ann_path= "./data/docVQA/val/val_v1.0.json",
    ):
        # Use a context manager so the annotation file handle is closed.
        with open(ann_path, "r") as ann_file:
            self.data = json.load(ann_file)["data"]
        self.image_dir_path = image_dir_path

    def __len__(self):
        """Return the number of annotated questions."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the image path, question and ground-truth answers for ``idx``."""
        record = self.data[idx]
        img_path = os.path.join(self.image_dir_path, record['image'])
        return {
            "image_path": img_path,
            "question": record['question'],
            "gt_answers": record['answers']}
class ocrVQADataset(Dataset):
    """OCR-VQA samples flattened to one entry per (image, question) pair.

    The annotation file must be a JSON object mapping an image key to a
    record with parallel ``questions`` and ``answers`` lists. Images are
    looked up as ``<image_dir_path>/<image key>.jpg``.

    Args:
        image_dir_path: Directory containing the images.
        ann_path: Path to the JSON annotation file.
    """

    def __init__(
        self,
        image_dir_path= "./data/ocrVQA/images",
        ann_path= "./data/ocrVQA/dataset.json",
    ):
        self.image_list = []
        self.question_list = []
        self.answer_list = []
        with open(ann_path, "r") as ann_file:
            dataset = json.load(ann_file)
        for image_key, entry in dataset.items():
            image_file = os.path.join(image_dir_path, f'{image_key}.jpg')
            answers = entry['answers']
            for index, question in enumerate(entry['questions']):
                # Index instead of zip so a question without a matching
                # answer still fails loudly.
                gt_answers = answers[index]
                self.image_list.append(image_file)
                self.answer_list.append(gt_answers)
                self.question_list.append(question)

    def __len__(self):
        """Return the number of (image, question) pairs."""
        # The three lists are filled in lockstep, so any of them gives the size.
        return len(self.question_list)

    def __getitem__(self, idx):
        """Return the image path, question and ground-truth answer for ``idx``."""
        return {
            "image_path": self.image_list[idx],
            "question": self.question_list[idx],
            "gt_answers": self.answer_list[idx]}
class STVQADataset(Dataset):
    """ST-VQA samples read from a JSON annotation file.

    The annotation file must be a JSON object whose ``"data"`` entry is a
    list of records with ``dataset``, ``file_name``, ``question`` and
    ``answers`` keys. Images are looked up as
    ``<image_dir_path>/<dataset>/<file_name>``.

    Args:
        image_dir_path: Root directory of the images.
        ann_path: Path to the JSON annotation file.
    """

    def __init__(
        self,
        image_dir_path= "./data/STVQA",
        ann_path= "./data/STVQA/train_task_3.json",
    ):
        self.image_list = []
        self.question_list = []
        self.answer_list = []
        with open(ann_path, "r") as ann_file:
            records = json.load(ann_file)['data']
        # Iterate over the records themselves; the number of top-level keys
        # in the annotation object says nothing about how many samples exist.
        for record in records:
            image_path = image_dir_path+'/'+record['dataset']+'/'+record['file_name']
            self.image_list.append(image_path)
            self.answer_list.append(record['answers'])
            self.question_list.append(record['question'])

    def __len__(self):
        """Return the number of annotated questions."""
        # The three lists are filled in lockstep, so any of them gives the size.
        return len(self.question_list)

    def __getitem__(self, idx):
        """Return the image path, question and ground-truth answers for ``idx``."""
        return {
            "image_path": self.image_list[idx],
            "question": self.question_list[idx],
            "gt_answers": self.answer_list[idx]}
+440
View File
@@ -0,0 +1,440 @@
import argparse
#from models.BLIP2.BLIP2 import BLIP2
import more_itertools
from tqdm import tqdm
import datetime
import os
import json
import re
from datasets.vqa_dataset import textVQADataset, docVQADataset, ocrVQADataset, STVQADataset
from datasets.ocr_dataset import ocrDataset
from models.lavis.lavis import lavis
import torch
import numpy as np
def get_model(args):
    """Build the model wrapper selected by ``args.model_name``.

    Args:
        args: Parsed command-line namespace. For ``model_name == 'BLIP2'`` the
            attributes ``BLIP2_model_name``, ``BLIP2_model_type`` and
            ``device`` are read.

    Returns:
        The model wrapper instance.

    Raises:
        ValueError: If ``args.model_name`` is not a supported model.
    """
    if args.model_name=='BLIP2':
        # BLIP2 is served through the LAVIS wrapper.
        return lavis(args.BLIP2_model_name, args.BLIP2_model_type, args.device)
    # Fail with a clear message instead of hitting an unbound local variable.
    raise ValueError(
        f"Unsupported model_name {args.model_name!r}; supported values: 'BLIP2'"
    )
def has_word(sentence, word):
    """Return True if ``word`` occurs in ``sentence`` between regex word boundaries.

    ``word`` is matched literally (it is regex-escaped), so punctuation in it
    has no special meaning.
    """
    bounded = r"\b" + re.escape(word) + r"\b"
    return re.search(bounded, sentence) is not None
def remove_special_chars(s):
    """Return ``s`` with every character other than ASCII letters, digits and whitespace removed."""
    return re.sub(r"[^a-zA-Z0-9\s]", "", s)
class VQAEval:
    """Normalise VQA answers and score a prediction against ground truth.

    Both the prediction and each ground-truth answer are normalised
    (whitespace, punctuation, number words, articles, contractions); a
    prediction scores 1 when a normalised ground-truth answer occurs in it as
    a whole word, and 0 otherwise.
    """

    def __init__(self):
        # Apostrophe-less spellings mapped to their contraction.
        # NOTE(review): keys containing upper-case letters ("Im", "Ive", ...)
        # can never match because processDigitArticle lower-cases its input,
        # and the "somebody'd" entry looks reversed. Both are kept unchanged
        # so scores stay comparable with earlier runs -- confirm intent.
        self.contractions = {
            "aint": "ain't",
            "arent": "aren't",
            "cant": "can't",
            "couldve": "could've",
            "couldnt": "couldn't",
            "couldn'tve": "couldn't've",
            "couldnt've": "couldn't've",
            "didnt": "didn't",
            "doesnt": "doesn't",
            "dont": "don't",
            "hadnt": "hadn't",
            "hadnt've": "hadn't've",
            "hadn'tve": "hadn't've",
            "hasnt": "hasn't",
            "havent": "haven't",
            "hed": "he'd",
            "hed've": "he'd've",
            "he'dve": "he'd've",
            "hes": "he's",
            "howd": "how'd",
            "howll": "how'll",
            "hows": "how's",
            "Id've": "I'd've",
            "I'dve": "I'd've",
            "Im": "I'm",
            "Ive": "I've",
            "isnt": "isn't",
            "itd": "it'd",
            "itd've": "it'd've",
            "it'dve": "it'd've",
            "itll": "it'll",
            "let's": "let's",
            "maam": "ma'am",
            "mightnt": "mightn't",
            "mightnt've": "mightn't've",
            "mightn'tve": "mightn't've",
            "mightve": "might've",
            "mustnt": "mustn't",
            "mustve": "must've",
            "neednt": "needn't",
            "notve": "not've",
            "oclock": "o'clock",
            "oughtnt": "oughtn't",
            "ow's'at": "'ow's'at",
            "'ows'at": "'ow's'at",
            "'ow'sat": "'ow's'at",
            "shant": "shan't",
            "shed've": "she'd've",
            "she'dve": "she'd've",
            "she's": "she's",
            "shouldve": "should've",
            "shouldnt": "shouldn't",
            "shouldnt've": "shouldn't've",
            "shouldn'tve": "shouldn't've",
            "somebody'd": "somebodyd",
            "somebodyd've": "somebody'd've",
            "somebody'dve": "somebody'd've",
            "somebodyll": "somebody'll",
            "somebodys": "somebody's",
            "someoned": "someone'd",
            "someoned've": "someone'd've",
            "someone'dve": "someone'd've",
            "someonell": "someone'll",
            "someones": "someone's",
            "somethingd": "something'd",
            "somethingd've": "something'd've",
            "something'dve": "something'd've",
            "somethingll": "something'll",
            "thats": "that's",
            "thered": "there'd",
            "thered've": "there'd've",
            "there'dve": "there'd've",
            "therere": "there're",
            "theres": "there's",
            "theyd": "they'd",
            "theyd've": "they'd've",
            "they'dve": "they'd've",
            "theyll": "they'll",
            "theyre": "they're",
            "theyve": "they've",
            "twas": "'twas",
            "wasnt": "wasn't",
            "wed've": "we'd've",
            "we'dve": "we'd've",
            "weve": "we've",
            "werent": "weren't",
            "whatll": "what'll",
            "whatre": "what're",
            "whats": "what's",
            "whatve": "what've",
            "whens": "when's",
            "whered": "where'd",
            "wheres": "where's",
            "whereve": "where've",
            "whod": "who'd",
            "whod've": "who'd've",
            "who'dve": "who'd've",
            "wholl": "who'll",
            "whos": "who's",
            "whove": "who've",
            "whyll": "why'll",
            "whyre": "why're",
            "whys": "why's",
            "wont": "won't",
            "wouldve": "would've",
            "wouldnt": "wouldn't",
            "wouldnt've": "wouldn't've",
            "wouldn'tve": "wouldn't've",
            "yall": "y'all",
            "yall'll": "y'all'll",
            "y'allll": "y'all'll",
            "yall'd've": "y'all'd've",
            "y'alld've": "y'all'd've",
            "y'all'dve": "y'all'd've",
            "youd": "you'd",
            "youd've": "you'd've",
            "you'dve": "you'd've",
            "youll": "you'll",
            "youre": "you're",
            "youve": "you've",
        }
        # Number words mapped to digit strings.
        self.manualMap = {
            "none": "0",
            "zero": "0",
            "one": "1",
            "two": "2",
            "three": "3",
            "four": "4",
            "five": "5",
            "six": "6",
            "seven": "7",
            "eight": "8",
            "nine": "9",
            "ten": "10",
        }
        self.articles = ["a", "an", "the"]
        # Raw strings avoid invalid-escape warnings; the patterns are unchanged.
        # NOTE(review): "(?!<=\d)" is a negative lookahead for the literal
        # text "<=" plus a digit, not a lookbehind, so in practice the pattern
        # matches every period that is not followed by a digit. Kept as is so
        # normalisation results do not shift -- confirm intent.
        self.periodStrip = re.compile(r"(?!<=\d)(\.)(?!\d)")
        self.commaStrip = re.compile(r"(\d)(\,)(\d)")
        self.punct = [
            ";",
            r"/",
            "[",
            "]",
            '"',
            "{",
            "}",
            "(",
            ")",
            "=",
            "+",
            "\\",
            "_",
            "-",
            ">",
            "<",
            "@",
            "`",
            ",",
            "?",
            "!",
        ]

    def evaluate(self, answer, gt_answers):
        """Score ``answer`` against one or several ground-truth answers.

        Args:
            answer: Predicted answer string.
            gt_answers: A single ground-truth string, or a list/tuple of them.
                The argument is not modified.

        Returns:
            1 if any normalised ground-truth answer occurs in the normalised
            prediction as a whole word, else 0 (also 0 for an empty list).
        """
        answer = self._normalize(answer)
        if isinstance(gt_answers, (list, tuple)):
            candidates = gt_answers
        else:
            candidates = [gt_answers]
        for candidate in candidates:
            if has_word(answer, self._normalize(candidate)):
                return 1
        return 0

    def _normalize(self, text):
        """Apply whitespace, punctuation and digit/article normalisation to ``text``."""
        text = text.replace("\n", " ").replace("\t", " ").strip()
        text = self.processPunctuation(text)
        return self.processDigitArticle(text)

    def processPunctuation(self, inText):
        """Strip punctuation from ``inText``.

        Each symbol in ``self.punct`` is deleted when it touches a space in
        the input or when the input contains a digit,comma,digit sequence;
        otherwise it is replaced by a space. Periods matched by
        ``self.periodStrip`` are then removed.
        """
        # The comma test only depends on the input, so evaluate it once.
        has_digit_comma = self.commaStrip.search(inText) is not None
        outText = inText
        for p in self.punct:
            if has_digit_comma or p + " " in inText or " " + p in inText:
                outText = outText.replace(p, "")
            else:
                outText = outText.replace(p, " ")
        # No third positional argument here: it is `count`, not `flags`, so
        # passing re.UNICODE capped the substitution at 32 periods.
        return self.periodStrip.sub("", outText)

    def processDigitArticle(self, inText):
        """Lower-case ``inText``, map number words to digits, drop articles and restore contractions."""
        outText = []
        for word in inText.lower().split():
            # .get() rather than .setdefault(): a lookup must not insert every
            # word it sees into the shared number-word table.
            word = self.manualMap.get(word, word)
            if word not in self.articles:
                outText.append(word)
        for wordId, word in enumerate(outText):
            if word in self.contractions:
                outText[wordId] = self.contractions[word]
        return " ".join(outText)
def evaluate_VQA(
    model,
    dataset,
    model_name,
    dataset_name,
    time,
    batch_size=1,
    answer_path='./answers'
):
    """Run ``model`` over a VQA dataset, save the predictions and return accuracy.

    Args:
        model: Object exposing ``generate(image=<path>, question=<str>)``.
        dataset: Iterable of dicts with ``image_path``, ``question`` and
            ``gt_answers`` keys.
        model_name: Name recorded with every prediction.
        dataset_name: Used for the output file name and the printed summary.
        time: Run identifier string; predictions go to
            ``<answer_path>/<time>/<dataset_name>.json``.
        batch_size: Chunk size used while iterating. Inference is still done
            one sample at a time, and every sample of a chunk is evaluated.
        answer_path: Root directory for prediction files.

    Returns:
        Fraction of samples scored correct by ``VQAEval`` (0.0 for an empty
        dataset).
    """
    predictions=[]
    for batch in more_itertools.chunked(
        tqdm(dataset, desc="Running inference"), batch_size
    ):
        # Walk the whole chunk: taking only batch[0] silently dropped samples
        # whenever batch_size was greater than 1.
        for sample in batch:
            output = model.generate(image=sample['image_path'], question=sample['question'])
            answer_dict={'question':sample['question'], 'answer':output,
            'gt_answers':sample['gt_answers'], 'image_path':sample['image_path'],
            'model_name':model_name}
            predictions.append(answer_dict)
    answer_dir = os.path.join(answer_path, time)
    os.makedirs(answer_dir, exist_ok=True)
    answer_file = os.path.join(answer_dir, f"{dataset_name}.json")
    with open(answer_file, "w") as f:
        f.write(json.dumps(predictions, indent=4))
    # Score the in-memory predictions; reading back the file that was just
    # written adds nothing.
    evaluator = VQAEval()
    correct = 0
    for prediction in predictions:
        if evaluator.evaluate(prediction['answer'], prediction['gt_answers'])==1:
            correct+=1
    num = len(predictions)
    # An empty dataset yields 0.0 instead of a ZeroDivisionError.
    accuracy = float(correct)/num if num else 0.0
    print(f'{dataset_name}:{accuracy}')
    return accuracy
def evaluate_OCR(
    model,
    dataset,
    model_name,
    dataset_name,
    time,
    question='what is written in the image?',
    batch_size=1,
    answer_path='./answers'
):
    """Run ``model`` over a text-recognition dataset, save predictions and return accuracy.

    The same ``question`` is asked for every image. A prediction counts as
    correct when the ground-truth text, after removing special characters and
    lower-casing, occurs as a whole word in the equally normalised prediction.

    Args:
        model: Object exposing ``generate(image=<path>, question=<str>)``.
        dataset: Iterable of dicts with ``image_path`` and ``gt_answers`` keys.
        model_name: Name recorded with every prediction.
        dataset_name: Used for the output file name and the printed summary.
        time: Run identifier string; predictions go to
            ``<answer_path>/<time>/<dataset_name>.json``.
        question: Prompt asked for every image.
        batch_size: Chunk size used while iterating. Inference is still done
            one sample at a time, and every sample of a chunk is evaluated.
        answer_path: Root directory for prediction files.

    Returns:
        Fraction of samples counted correct (0.0 for an empty dataset).
    """
    predictions=[]
    for batch in more_itertools.chunked(
        tqdm(dataset, desc="Running inference"), batch_size
    ):
        # Walk the whole chunk: taking only batch[0] silently dropped samples
        # whenever batch_size was greater than 1.
        for sample in batch:
            output = model.generate(image=sample['image_path'], question=question)
            answer_dict={'question':question, 'answer':output,
            'gt_answers':sample['gt_answers'], 'image_path':sample['image_path'],
            'model_name':model_name}
            predictions.append(answer_dict)
    answer_dir = os.path.join(answer_path, time)
    os.makedirs(answer_dir, exist_ok=True)
    answer_file = os.path.join(answer_dir, f"{dataset_name}.json")
    with open(answer_file, "w") as f:
        f.write(json.dumps(predictions, indent=4))
    correct = 0
    for prediction in predictions:
        gt_answers = remove_special_chars(prediction['gt_answers']).lower()
        answer = remove_special_chars(prediction['answer']).lower()
        # NOTE(review): a ground truth made only of special characters
        # normalises to "" and then matches any non-empty answer -- confirm
        # whether such labels should count as correct.
        if has_word(answer, gt_answers):
            correct+=1
    num = len(predictions)
    # An empty dataset yields 0.0 instead of a ZeroDivisionError.
    accuracy = float(correct)/num if num else 0.0
    print(f'{dataset_name}:{accuracy}')
    return accuracy
def parse_args(argv=None):
    """Parse the evaluation script's command-line options.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            the arguments are taken from ``sys.argv`` by argparse.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(description="Demo")
    #OCR datasets
    parser.add_argument("--ocr_dir_path", type=str, default="./data")
    parser.add_argument("--ocr_dataset_name", type=str, default="IIIT5K svt IC13_857 IC15_1811 svtp ct80 cocotext ctw totaltext HOST WOST WordArt")
    #textVQA
    parser.add_argument("--textVQA_image_dir_path", type=str, default="./data/textVQA/train_images")
    parser.add_argument("--textVQA_ann_path", type=str, default="./data/textVQA/TextVQA_0.5.1_val.json")
    #docVQA
    parser.add_argument("--docVQA_image_dir_path", type=str, default="./data/docVQA/val")
    parser.add_argument("--docVQA_ann_path", type=str, default="./data/docVQA/val/val_v1.0.json")
    #ocrVQA
    parser.add_argument("--ocrVQA_image_dir_path", type=str, default="./data/ocrVQA/images")
    parser.add_argument("--ocrVQA_ann_path", type=str, default="./data/ocrVQA/dataset.json")
    #STVQA
    parser.add_argument("--STVQA_image_dir_path", type=str, default="./data/STVQA")
    parser.add_argument("--STVQA_ann_path", type=str, default="./data/STVQA/train_task_3.json")
    #result_path
    parser.add_argument("--answer_path", type=str, default="./answers")
    # One on/off switch per benchmark; all are disabled unless requested.
    for benchmark in ("textVQA", "docVQA", "ocrVQA", "STVQA", "ocr"):
        parser.add_argument(
            f"--eval_{benchmark}",
            action="store_true",
            default=False,
            help=f"Whether to evaluate on {benchmark}."
        )
    #BLIP2
    parser.add_argument("--BLIP2_model_name", type=str, default="blip2_opt")#blip2_t5 blip2_opt blip2_vicuna_instruct
    parser.add_argument("--BLIP2_model_type", type=str, default="pretrain_opt6.7b")#pretrain_flant5xxl pretrain_opt6.7b vicuna13b
    parser.add_argument("--model_name", type=str, default="BLIP2")#mPLUG,miniGPT4,LLaVA
    parser.add_argument("--device", type=str, default="cuda:2")
    args = parser.parse_args(argv)
    return args
def _random_subset(dataset, max_sample_num):
    """Return a random ``Subset`` of ``dataset`` holding at most ``max_sample_num`` samples."""
    # Sampling without replacement cannot draw more items than exist, so cap
    # the sample size at the dataset size.
    sample_num = min(max_sample_num, len(dataset))
    random_indices = np.random.choice(len(dataset), sample_num, replace=False)
    return torch.utils.data.Subset(dataset, random_indices)


def main(args):
    """Evaluate the selected model on every benchmark enabled in ``args``.

    Per-dataset predictions are written by the evaluation helpers; the
    accuracy of every evaluated dataset is collected into
    ``<args.answer_path>/<timestamp>/result.json``.

    Args:
        args: Namespace produced by ``parse_args``.
    """
    np.random.seed(0)
    max_sample_num = 5000
    model = get_model(args)
    # OCR dataset names arrive as one whitespace-separated string, e.g.
    # "IIIT5K svt IC13_857 IC15_1811 svtp ct80 cocotext ctw totaltext HOST WOST WordArt".
    ocr_dataset_name = args.ocr_dataset_name.split()
    result = {}
    time = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    # Forward --answer_path so predictions and result.json share a directory.
    answer_path = args.answer_path
    if args.eval_textVQA:
        dataset = textVQADataset(args.textVQA_image_dir_path, args.textVQA_ann_path)
        acc = evaluate_VQA(model, dataset, args.model_name, 'textVQA', time, answer_path=answer_path)
        result['textVQA'] = acc
    if args.eval_docVQA:
        dataset = docVQADataset(args.docVQA_image_dir_path, args.docVQA_ann_path)
        acc = evaluate_VQA(model, dataset, args.model_name, 'docVQA', time, answer_path=answer_path)
        result['docVQA'] = acc
    if args.eval_ocrVQA:
        dataset = ocrVQADataset(args.ocrVQA_image_dir_path, args.ocrVQA_ann_path)
        dataset = _random_subset(dataset, max_sample_num)
        acc = evaluate_VQA(model, dataset, args.model_name, 'ocrVQA', time, answer_path=answer_path)
        result['ocrVQA'] = acc
    if args.eval_STVQA:
        dataset = STVQADataset(args.STVQA_image_dir_path, args.STVQA_ann_path)
        dataset = _random_subset(dataset, max_sample_num)
        acc = evaluate_VQA(model, dataset, args.model_name, 'STVQA', time, answer_path=answer_path)
        result['STVQA'] = acc
    if args.eval_ocr:
        for name in ocr_dataset_name:
            dataset = ocrDataset(args.ocr_dir_path, name)
            acc = evaluate_OCR(model, dataset, args.model_name, name, time, answer_path=answer_path)
            result[name] = acc
    result_dir = os.path.join(answer_path, time)
    # The directory does not exist yet when no benchmark was enabled.
    os.makedirs(result_dir, exist_ok=True)
    result_path = os.path.join(result_dir, 'result.json')
    with open(result_path, "w") as f:
        f.write(json.dumps(result, indent=4))
if __name__ == "__main__":
    # Script entry point: parse the command line and run the evaluation.
    main(parse_args())
+21
View File
@@ -0,0 +1,21 @@
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch
from PIL import Image
from ..process import pad_image
#There are some issues with the Hugging Face version of the BLIP2-opt model.
class BLIP2:
    """Wrapper that answers a question about an image with a Hugging Face BLIP-2 checkpoint."""

    def __init__(self, model_path, device = "cuda") -> None:
        """Load the processor and the float16 model from ``model_path`` onto ``device``."""
        self.device = device
        self.processor = Blip2Processor.from_pretrained(model_path)
        half_precision_model = Blip2ForConditionalGeneration.from_pretrained(
            model_path, torch_dtype=torch.float16)
        self.model = half_precision_model.to(device)
        self.model.eval()

    def generate(self, image, question, pad=True):
        """Return the decoded answer to ``question`` for the image file at ``image``.

        When ``pad`` is true the image is first passed through
        ``pad_image(image, (224,224))``.
        """
        prompt =f'Question: {question} Answer:'
        picture = Image.open(image)
        if pad:
            picture = pad_image(picture, (224,224))
        encoded = self.processor(images=picture, text=prompt, return_tensors="pt")
        encoded = encoded.to(self.device, torch.float16)
        token_ids = self.model.generate(**encoded)
        decoded = self.processor.batch_decode(token_ids, skip_special_tokens=True)
        return decoded[0].strip()
Binary file not shown.
+47
View File
@@ -0,0 +1,47 @@
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from transformers import CLIPVisionModel, CLIPImageProcessor, StoppingCriteria
import torch
# Placeholder token strings for image content.
# LLaVA.__init__ adds DEFAULT_IMAGE_PATCH_TOKEN to the tokenizer as a special
# token, and also adds the start/end pair when the model config sets
# `mm_use_im_start_end`.
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
def disable_torch_init():
    """
    Replace ``reset_parameters`` on ``torch.nn.Linear`` and ``torch.nn.LayerNorm``
    with a no-op, skipping the redundant default initialization to accelerate
    model creation.
    """
    for layer_cls in (torch.nn.Linear, torch.nn.LayerNorm):
        setattr(layer_cls, "reset_parameters", lambda self: None)
def patch_config(config):
    """Add the multimodal settings to the config stored at ``config`` if they are missing.

    The config is loaded with ``AutoConfig.from_pretrained``. When it has no
    ``mm_vision_tower`` attribute, the multimodal defaults below are set on it
    and the config is saved back to the same location.
    """
    cfg = AutoConfig.from_pretrained(config)
    if hasattr(cfg, "mm_vision_tower"):
        # Already patched; leave the stored config untouched.
        return
    print(f'`mm_vision_tower` not found in `{config}`, applying patch and save to disk.')
    multimodal_defaults = {
        "use_mm_proj": True,
        "mm_vision_tower": "openai/clip-vit-large-patch14",
        "mm_hidden_size": 1024
    }
    for name, value in multimodal_defaults.items():
        setattr(cfg, name, value)
    cfg.save_pretrained(config)
class LLaVA:
def __init__(self, model_path) -> None:
tokenizer = AutoTokenizer.from_pretrained(model_path)
patch_config(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16).cuda()
image_processor = CLIPImageProcessor.from_pretrained(model.config.mm_vision_tower, torch_dtype=torch.float16)
mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
if mm_use_im_start_end:
tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
vision_tower = model.model.vision_tower[0]
vision_tower.to(device='cuda', dtype=torch.float16)
vision_config = vision_tower.config
vision_config.im_patch_token = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0]
vision_config.use_im_start_end = mm_use_im_start_end
if mm_use_im_start_end:
vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
image_token_len = (vision_config.image_size // vision_config.patch_size) ** 2
def generate(self, image, question):
Binary file not shown.
Binary file not shown.
+20
View File
@@ -0,0 +1,20 @@
import torch
from PIL import Image
from lavis.models import load_model_and_preprocess
from ..process import pad_image
class lavis:
    """Wrapper that answers a question about an image with a LAVIS model."""

    def __init__(self, model_name, model_type, device) -> None:
        """Load the model and its processors via ``load_model_and_preprocess`` in eval mode."""
        loaded = load_model_and_preprocess(name = model_name, model_type = model_type, is_eval=True, device=device)
        self.model, self.vis_processors, self.txt_processors = loaded
        self.device = device

    def generate(self, image, question, pad=True):
        """Return the model's answer to ``question`` for the image file at ``image``.

        The image is converted to RGB and, when ``pad`` is true, passed
        through ``pad_image(image, (224,224))`` before preprocessing.
        """
        prompt = f'Question: {question} Short answer:'
        picture = Image.open(image).convert("RGB")
        if pad:
            picture = pad_image(picture, (224,224))
        # unsqueeze(0) adds a leading dimension to the processed image.
        image_batch = self.vis_processors["eval"](picture).unsqueeze(0).to(self.device)
        text_input = self.txt_processors["eval"](prompt)
        answers = self.model.predict_answers(samples={"image": image_batch, "text_input": text_input}, inference_method="generate", max_len=32)
        return answers[0]
View File
+454
View File
@@ -0,0 +1,454 @@
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" CLIP model configuration"""
import copy
import os
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
if TYPE_CHECKING:
from transformers.processing_utils import ProcessorMixin
from transformers.utils import TensorType
from transformers.configuration_utils import PretrainedConfig
from transformers.onnx import OnnxConfig
from transformers.utils import logging
# Module-level logger; the config classes in this module use it for warnings.
logger = logging.get_logger(__name__)

# Maps a pretrained CLIP checkpoint identifier to the URL of its config.json.
CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "openai/clip-vit-base-patch32": "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/config.json",
    # See all CLIP models at https://huggingface.co/models?filter=clip
}
class CLIPTextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP
    text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the text encoder of the CLIP
    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`CLIPModel`].
        hidden_size (`int`, *optional*, defaults to 512):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Stored on the configuration as `projection_dim`; presumably the dimensionality of the text projection
            layer (to be confirmed against the model code).
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        max_position_embeddings (`int`, *optional*, defaults to 77):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        pad_token_id (`int`, *optional*, defaults to 1):
            Forwarded to [`PretrainedConfig`] as `pad_token_id`.
        bos_token_id (`int`, *optional*, defaults to 0):
            Forwarded to [`PretrainedConfig`] as `bos_token_id`.
        eos_token_id (`int`, *optional*, defaults to 2):
            Forwarded to [`PretrainedConfig`] as `eos_token_id`.

    Example:

    ```python
    >>> from transformers import CLIPTextConfig, CLIPTextModel

    >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration
    >>> configuration = CLIPTextConfig()

    >>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
    >>> model = CLIPTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "clip_text_model"

    def __init__(
        self,
        vocab_size=49408,
        hidden_size=512,
        intermediate_size=2048,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=8,
        max_position_embeddings=77,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        **kwargs,
    ):
        # The special-token ids and any extra keyword arguments are handled by the base class.
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """Build a text config from a pretrained checkpoint name or path.

        The config dict is obtained through `cls.get_config_dict`. When it describes a full CLIP model
        (`model_type == "clip"`), its `text_config` sub-dict is used instead. A warning is logged if the
        resulting `model_type` differs from `cls.model_type`.
        """
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
        # get the text config dict if we are loading from CLIPConfig
        if config_dict.get("model_type") == "clip":
            config_dict = config_dict["text_config"]
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )
        return cls.from_dict(config_dict, **kwargs)
class CLIPVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a
    CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP
    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Stored on the configuration as `projection_dim`; presumably the dimensionality of the vision projection
            layer (to be confirmed against the model code).
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Stored on the configuration as `num_channels`; presumably the number of input image channels (to be
            confirmed against the model code).
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import CLIPVisionConfig, CLIPVisionModel

    >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
    >>> configuration = CLIPVisionConfig()

    >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
    >>> model = CLIPVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "clip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        # Any extra keyword arguments are handled by the base class.
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """Build a vision config from a pretrained checkpoint name or path.

        The config dict is obtained through `cls.get_config_dict`. When it describes a full CLIP model
        (`model_type == "clip"`), its `vision_config` sub-dict is used instead. A warning is logged if the
        resulting `model_type` differs from `cls.model_type`.
        """
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
        # get the vision config dict if we are loading from CLIPConfig
        if config_dict.get("model_type") == "clip":
            config_dict = config_dict["vision_config"]
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )
        return cls.from_dict(config_dict, **kwargs)
class CLIPConfig(PretrainedConfig):
    r"""
    Composite configuration of a [`CLIPModel`]: it holds one [`CLIPTextConfig`] and one [`CLIPVisionConfig`] plus the
    settings shared by both towers. Instantiating it with the defaults yields a configuration similar to that of the
    CLIP [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPTextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of the text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP
            implementation.
        kwargs (*optional*):
            Dictionary of keyword arguments. The legacy keys `text_config_dict` and `vision_config_dict` are popped
            and merged into `text_config` / `vision_config`; everything else is forwarded to
            `PretrainedConfig.__init__`.

    Example:

    ```python
    >>> from transformers import CLIPConfig, CLIPModel

    >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration
    >>> configuration = CLIPConfig()

    >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
    >>> model = CLIPModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig
    >>> from transformers import CLIPTextConfig, CLIPVisionConfig

    >>> # Initializing a CLIPText and CLIPVision configuration
    >>> config_text = CLIPTextConfig()
    >>> config_vision = CLIPVisionConfig()

    >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision)
    ```"""

    model_type = "clip"
    is_composition = True

    def __init__(
        self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
    ):
        # The legacy `*_config_dict` entries are taken out of `kwargs` before the parent constructor runs, so
        # they are never stored on (or saved with) the configuration.
        legacy_text = kwargs.pop("text_config_dict", None)
        legacy_vision = kwargs.pop("vision_config_dict", None)

        super().__init__(**kwargs)

        # A legacy dict does not simply replace `[text|vision]_config`: it is expanded through the matching
        # sub-config class and the expanded values are written on top of `[text|vision]_config`, warning about
        # every value that gets replaced by a different one.
        if legacy_text is not None:
            text_config = {} if text_config is None else text_config

            # Full set of values implied by `text_config_dict` (explicit entries plus class defaults).
            expanded_text = CLIPTextConfig(**legacy_text).to_dict()

            for name, expanded_value in expanded_text.items():
                clashes = name in text_config and expanded_value != text_config[name]
                if not clashes or name in ["transformers_version"]:
                    continue
                if name in legacy_text:
                    # The caller set the value explicitly in `text_config_dict`.
                    warning = (
                        f"`{name}` is found in both `text_config_dict` and `text_config` but with different values. "
                        f'The value `text_config_dict["{name}"]` will be used instead.'
                    )
                else:
                    # The value only comes from a `CLIPTextConfig` default.
                    warning = (
                        f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
                        f'value `text_config["{name}"]` will be overriden.'
                    )
                logger.warning(warning)

            text_config.update(expanded_text)

        if legacy_vision is not None:
            vision_config = {} if vision_config is None else vision_config

            # Full set of values implied by `vision_config_dict` (explicit entries plus class defaults).
            expanded_vision = CLIPVisionConfig(**legacy_vision).to_dict()
            # Use string keys for `id2label` instead of integers.
            if "id2label" in expanded_vision:
                expanded_vision["id2label"] = {
                    str(label_id): label for label_id, label in expanded_vision["id2label"].items()
                }

            for name, expanded_value in expanded_vision.items():
                clashes = name in vision_config and expanded_value != vision_config[name]
                if not clashes or name in ["transformers_version"]:
                    continue
                if name in legacy_vision:
                    # The caller set the value explicitly in `vision_config_dict`.
                    warning = (
                        f"`{name}` is found in both `vision_config_dict` and `vision_config` but with different "
                        f'values. The value `vision_config_dict["{name}"]` will be used instead.'
                    )
                else:
                    # The value only comes from a `CLIPVisionConfig` default.
                    warning = (
                        f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
                        f'The value `vision_config["{name}"]` will be overriden.'
                    )
                logger.warning(warning)

            vision_config.update(expanded_vision)

        if text_config is None:
            text_config = {}
            logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")

        if vision_config is None:
            vision_config = {}
            logger.info("`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.")

        self.text_config = CLIPTextConfig(**text_config)
        self.vision_config = CLIPVisionConfig(**vision_config)

        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.initializer_factor = 1.0

    @classmethod
    def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
        r"""
        Build a [`CLIPConfig`] (or a derived class) from an existing text configuration object and an existing
        vision configuration object.

        Returns:
            [`CLIPConfig`]: An instance of a configuration object
        """
        text_options = text_config.to_dict()
        vision_options = vision_config.to_dict()
        return cls(text_config=text_options, vision_config=vision_options, **kwargs)

    def to_dict(self):
        """
        Serialize this instance to a Python dictionary, overriding the default [`~PretrainedConfig.to_dict`] so
        that the nested text and vision configurations are serialized as dictionaries too.

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
        """
        serialized = copy.deepcopy(self.__dict__)
        serialized.update(
            text_config=self.text_config.to_dict(),
            vision_config=self.vision_config.to_dict(),
            model_type=self.__class__.model_type,
        )
        return serialized
class CLIPOnnxConfig(OnnxConfig):
    """ONNX export settings for CLIP: named input/output axes, validation tolerance, dummy inputs and opset."""

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from each input name to its `{axis index: axis name}` description."""
        axes = OrderedDict()
        axes["input_ids"] = {0: "batch", 1: "sequence"}
        axes["pixel_values"] = {0: "batch", 1: "num_channels", 2: "height", 3: "width"}
        axes["attention_mask"] = {0: "batch", 1: "sequence"}
        return axes

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from each output name to its `{axis index: axis name}` description."""
        output_names = ("logits_per_image", "logits_per_text", "text_embeds", "image_embeds")
        # Each output gets its own dict so that callers mutating one entry do not affect the others.
        return OrderedDict((output_name, {0: "batch"}) for output_name in output_names)

    @property
    def atol_for_validation(self) -> float:
        """Absolute tolerance reported for validating the exported model."""
        return 1e-4

    def generate_dummy_inputs(
        self,
        processor: "ProcessorMixin",
        batch_size: int = -1,
        seq_length: int = -1,
        framework: Optional["TensorType"] = None,
    ) -> Mapping[str, Any]:
        """
        Build dummy inputs by asking the parent implementation once for the processor's tokenizer and once for
        its feature extractor, then merging both results (image entries win on a key clash).
        """
        dummy_inputs = dict(
            super().generate_dummy_inputs(
                processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
            )
        )
        dummy_inputs.update(
            super().generate_dummy_inputs(processor.feature_extractor, batch_size=batch_size, framework=framework)
        )
        return dummy_inputs

    @property
    def default_onnx_opset(self) -> int:
        """ONNX opset version reported as the default for this export."""
        return 14
File diff suppressed because it is too large Load Diff
+44
View File
@@ -0,0 +1,44 @@
import argparse
import json
import torch
from transformers.models.llama.configuration_llama import LlamaConfig
from mplug_owl.configuration_mplug_owl import mPLUG_OwlConfig
from mplug_owl.modeling_mplug_owl import mPLUG_OwlForConditionalGeneration
from transformers.models.llama.tokenization_llama import LlamaTokenizer
from mplug_owl.modeling_mplug_owl import ImageProcessor
from mplug_owl.tokenize_utils import tokenize_prompts
class mPLUG:
    """Inference wrapper that bundles an mPLUG-Owl model, its Llama tokenizer and its image processor."""

    def __init__(self, checkpoint_path=None, tokenizer_path=None) -> None:
        """
        Build the model in bfloat16 / eval mode and optionally load weights.

        Args:
            checkpoint_path: Optional path to a state dict loaded with `torch.load` (non-strict).
            tokenizer_path: Path handed to `LlamaTokenizer`; required.

        Raises:
            AssertionError: If `tokenizer_path` is `None`.
        """
        config = mPLUG_OwlConfig()
        self.model = mPLUG_OwlForConditionalGeneration(config=config).to(torch.bfloat16)
        self.model.eval()

        if checkpoint_path is not None:
            # NOTE: `torch.load` unpickles the file; only load checkpoints from a trusted source.
            tmp_ckpt = torch.load(checkpoint_path, map_location='cpu')
            # `strict=False` tolerates missing/unexpected keys; the report is printed for inspection.
            msg = self.model.load_state_dict(tmp_ckpt, strict=False)
            print(msg)

        assert tokenizer_path is not None, "tokenizer_path is required"
        self.tokenizer = LlamaTokenizer(
            tokenizer_path, pad_token='<unk>', add_bos_token=False)
        self.img_processor = ImageProcessor()

    def generate(self, image, question, max_length=512, top_k=1, do_sample=True, **generate_kwargs):
        """
        Answer `question` about `image` and return the decoded text.

        Args:
            image: Input passed as-is to the image processor.
            question: Question text inserted into the conversation prompt.
            max_length: Forwarded to `model.generate` as `max_length`.
            top_k: Forwarded to `model.generate`.
            do_sample: Forwarded to `model.generate`.
            **generate_kwargs: Extra keyword arguments forwarded to `model.generate`.

        Returns:
            The first generated sequence decoded with special tokens skipped.
        """
        prompt = (
            "The following is a conversation between a curious human and AI assistant. "
            "The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
            "Human: <image>\n"
            f"Human: {question}\n"
            "AI: "
        )
        prompts = [prompt]

        tokens_to_generate = 0
        add_BOS = True
        # The per-prompt length tensor returned in the middle is not needed here.
        context_tokens_tensor, _, attention_mask = tokenize_prompts(
            prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS,
            tokenizer=self.tokenizer, ignore_dist=True)

        images = self.img_processor(image).to(torch.bfloat16).cuda()
        context_tokens_tensor = context_tokens_tensor.cuda()
        # Keep the mask on the same device as the token ids it describes.
        attention_mask = attention_mask.to(context_tokens_tensor.device)

        self.model.eval()
        with torch.no_grad():
            # The keyword was previously misspelled `max_lengt`, so the limit never reached `generate`.
            res = self.model.generate(
                input_ids=context_tokens_tensor,
                pixel_values=images,
                attention_mask=attention_mask,
                max_length=max_length,
                top_k=top_k,
                do_sample=do_sample,
                **generate_kwargs,
            )
        sentence = self.tokenizer.decode(res.tolist()[0], skip_special_tokens=True)
        return sentence
@@ -0,0 +1,154 @@
# coding=utf-8
# Copyright 2023 Alibaba Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import os
from typing import Union
from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto import CONFIG_MAPPING
from transformers.models.auto.modeling_auto import \
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from transformers.utils import logging
logger = logging.get_logger(__name__)
class mPLUG_OwlVisualAbstractorConfig(PretrainedConfig):
    """Configuration of the mPLUG-Owl visual abstractor; every constructor argument is stored as an attribute."""

    model_type = "mPLUG_OwlVisualAbstractor"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=1024,
        num_hidden_layers=6,
        num_attention_heads=8,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        pad_token_id=0,
        position_embedding_type="absolute",
        classifier_dropout=None,
        cross_attention_frequency=2,
        encoder_hidden_size=1024,
        **kwargs,
    ):
        """Store the given hyper-parameters; `pad_token_id` and `kwargs` go to `PretrainedConfig.__init__`."""
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        # Insertion order below fixes the order in which the attributes are created on the instance.
        settings = {
            "vocab_size": vocab_size,
            "hidden_size": hidden_size,
            "num_hidden_layers": num_hidden_layers,
            "num_attention_heads": num_attention_heads,
            "hidden_act": hidden_act,
            "intermediate_size": intermediate_size,
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "max_position_embeddings": max_position_embeddings,
            "initializer_range": initializer_range,
            "layer_norm_eps": layer_norm_eps,
            "position_embedding_type": position_embedding_type,
            "classifier_dropout": classifier_dropout,
            "cross_attention_frequency": cross_attention_frequency,
            "encoder_hidden_size": encoder_hidden_size,
        }
        for attribute, setting in settings.items():
            setattr(self, attribute, setting)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """
        Load this configuration from `pretrained_model_name_or_path`.

        When the loaded dictionary belongs to a full `mplug_owl` configuration, its `abstractor_config` entry
        is used instead. A warning is logged if the resulting `model_type` differs from this class's.
        """
        loaded, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # A composite mplug_owl config nests the abstractor settings under "abstractor_config".
        if loaded.get("model_type") == "mplug_owl":
            loaded = loaded["abstractor_config"]

        type_mismatch = (
            "model_type" in loaded
            and hasattr(cls, "model_type")
            and loaded["model_type"] != cls.model_type
        )
        if type_mismatch:
            logger.warning(
                f"You are using a model of type {loaded['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(loaded, **kwargs)
class mPLUG_OwlConfig(PretrainedConfig):
    """Composite mPLUG-Owl configuration: a CLIP vision config, a visual abstractor config and a text config."""

    model_type = "mplug_owl"
    is_composition = True

    def __init__(self, vision_config=None, visual_abstractor_config=None, text_config=None, num_query_tokens=64, **kwargs):
        """
        Args:
            vision_config (`dict`, *optional*): Options for `CLIPVisionConfig`. Defaults to the module-level
                `vision_config_dict` with `layer_norm_eps=1e-6`.
            visual_abstractor_config (`dict`, *optional*): Options for `mPLUG_OwlVisualAbstractorConfig`.
                Its `layer_norm_eps` is forced to `1e-6` and its `encoder_hidden_size` is forced to the
                vision `hidden_size`.
            text_config (`dict`, *optional*): Options for the language model config; its `model_type`
                entry (default `"opt"`) selects the class from `CONFIG_MAPPING`. Defaults to
                `LlamaConfig(pad_token_id=2)`.
            num_query_tokens (`int`, *optional*, defaults to 64): Stored as `self.num_query_tokens`.
            kwargs: Forwarded to `PretrainedConfig.__init__`.
        """
        super().__init__(**kwargs)

        from clip.configuration_clip import CLIPVisionConfig

        if vision_config is None:
            # By default we use openai-clip large patch14
            vision_config = CLIPVisionConfig(
                **vision_config_dict, layer_norm_eps=1e-6).to_dict()
            logger.info(
                "vision_config is None.")

        if visual_abstractor_config is None:
            visual_abstractor_config = {}
            logger.info(
                "abstractor_config is None. ")

        if text_config is None:
            # we use LLAMA 7b by default
            from transformers.models.llama.configuration_llama import \
                LlamaConfig
            text_config = LlamaConfig(pad_token_id=2).to_dict()
            logger.info("text_config is None.")

        self.vision_config = CLIPVisionConfig(**vision_config)
        self.visual_abstractor_config = mPLUG_OwlVisualAbstractorConfig(
            **visual_abstractor_config)
        # Always overrides whatever `layer_norm_eps` the abstractor options carried.
        self.visual_abstractor_config.layer_norm_eps = 1e-6

        text_model_type = text_config["model_type"] if "model_type" in text_config else "opt"
        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)

        # Mirror the language model's settings on the composite config.
        self.tie_word_embeddings = self.text_config.tie_word_embeddings
        self.is_encoder_decoder = self.text_config.is_encoder_decoder

        self.num_query_tokens = num_query_tokens
        # The abstractor consumes vision features, so its encoder width follows the vision width.
        self.visual_abstractor_config.encoder_hidden_size = self.vision_config.hidden_size
        self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
        self.initializer_factor = 1.0
        self.initializer_range = 0.02

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        All three nested configurations are converted to dictionaries. The abstractor settings appear under
        both `visual_abstractor_config` (the constructor argument name, so the result can be fed back to
        `__init__`) and `abstractor_config` (the key read by
        `mPLUG_OwlVisualAbstractorConfig.from_pretrained`).

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
        """
        output = copy.deepcopy(self.__dict__)
        abstractor_dict = self.visual_abstractor_config.to_dict()
        output["vision_config"] = self.vision_config.to_dict()
        # Previously this entry kept the deep-copied config *object*, which is not JSON-serializable.
        output["visual_abstractor_config"] = abstractor_dict
        output["abstractor_config"] = copy.deepcopy(abstractor_dict)
        output["text_config"] = self.text_config.to_dict()
        output["model_type"] = self.__class__.model_type
        return output
# Default keyword arguments for `CLIPVisionConfig`; `mPLUG_OwlConfig.__init__` expands them
# (together with `layer_norm_eps=1e-6`) when no `vision_config` is supplied.
# NOTE(review): the comment in `mPLUG_OwlConfig.__init__` says this targets OpenAI CLIP
# large/patch14, whose published vision config presumably uses 16 attention heads -- confirm
# that `num_attention_heads: 8` is intended.
vision_config_dict = {
    "hidden_size": 1024,
    "intermediate_size": 4096,
    "num_attention_heads": 8,
    "num_hidden_layers": 24,
    "patch_size": 14,
    "projection_dim": 768}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,171 @@
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization utilities."""
import re
import torch
from icecream import ic
def detokenize_generations(tokens_gpu_tensor,
                           lengths_gpu_tensor,
                           return_segments, tokenizer):
    """Detokenize the generated tokens.

    Each row of `tokens_gpu_tensor` is cut to the matching entry of
    `lengths_gpu_tensor` and passed to `tokenizer.detokenize`.

    Returns `(tokens, texts)`, where `tokens` is the untruncated token
    tensor as nested lists. When `return_segments` is truthy a third
    element is appended holding, per row, the per-token pieces (or the
    decoded output when the tokenizer offers no per-token decoder).
    """
    tokens = tokens_gpu_tensor.cpu().numpy().tolist()
    lengths = lengths_gpu_tensor.cpu().numpy().tolist()

    texts = []
    segments = []
    for row, row_length in zip(tokens, lengths):
        trimmed = row[:row_length]
        texts.append(tokenizer.detokenize(trimmed))

        if not return_segments:
            continue

        # Imported lazily so that the dependency is only needed when
        # segments are actually requested.
        from tokenizers.decoders import Metaspace

        if not hasattr(tokenizer, 'tokenizer'):
            # No wrapped tokenizer available: use the plain detokenizer.
            words = tokenizer.detokenize(trimmed)
        elif isinstance(tokenizer.tokenizer.decoder, Metaspace):
            words = tokenizer.tokenizer.decode(trimmed)
        else:
            # Byte-level vocabulary: map every token back to its bytes
            # and decode those as UTF-8.
            inner = tokenizer.tokenizer
            words = [
                bytearray(
                    [inner.byte_decoder[c] for c in inner.decoder[token]]
                ).decode('utf-8', errors='replace')
                for token in trimmed
            ]
        segments.append(words)

    if return_segments:
        return tokens, texts, segments

    return tokens, texts
def tokenize_prompts(prompts=None, tokens_to_generate=None,
                     add_BOS=None, rank=0, tokenizer=None, ignore_dist=False):
    """Tokenize and pad a batch of prompts on the selected rank.

    The work is done when `ignore_dist` is truthy or when
    `torch.distributed.get_rank()` equals `rank`; otherwise nothing is
    tokenized. No broadcast to other ranks is performed here.

    Args:
        prompts: Prompt strings; required on the rank that tokenizes.
        tokens_to_generate: Number of extra padded positions appended to
            the longest prompt; required on the rank that tokenizes.
        add_BOS: Whether a BOS token id is prepended to each prompt.
        rank: Rank that performs the tokenization.
        tokenizer: Tokenizer forwarded to `_tokenize_prompts_and_batch`.
        ignore_dist: Skip the rank check (and the `torch.distributed`
            call) entirely.

    Returns:
        `(tokens, lengths, attention_mask)` as produced by
        `_tokenize_prompts_and_batch`, or `(None, None, None)` on ranks
        that do not tokenize.

    Raises:
        AssertionError: If `prompts` or `tokens_to_generate` is `None` on
            the rank that tokenizes.
    """
    # Short-circuit keeps `torch.distributed` untouched when `ignore_dist` is set.
    if not (ignore_dist or torch.distributed.get_rank() == rank):
        return None, None, None

    assert prompts is not None
    assert tokens_to_generate is not None
    # Padded token tensor, unpadded lengths, and the matching attention mask.
    prompts_tokens_tensor, prompts_length_tensor, attention_mask = \
        _tokenize_prompts_and_batch(
            prompts, tokens_to_generate, add_BOS, tokenizer)

    return prompts_tokens_tensor, prompts_length_tensor, attention_mask
def _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS, tokenizer):
    """Tokenize `prompts` and pack them into fixed-width tensors.

    Every prompt is encoded with `_tokenize_prompt`, then right-padded with
    `tokenizer.eos_token_id` up to the longest prompt length plus
    `tokens_to_generate`.

    Returns:
        A tuple of
        - a `LongTensor` with the padded token ids, one row per prompt,
        - a `LongTensor` with the unpadded length of each prompt,
        - an attention mask (default float dtype) that is 1 over each
          prompt's own tokens and 0 over the padding.
    """
    encoded = [_tokenize_prompt(text, tokenizer, add_BOS) for text in prompts]

    # Unpadded lengths, and the common width every row is padded to.
    lengths = [len(ids) for ids in encoded]
    total_length = max(lengths) + tokens_to_generate

    for ids, length in zip(encoded, lengths):
        ids.extend([tokenizer.eos_token_id] * (total_length - length))

    # All rows now share one length, so they convert to 2-D tensors.
    prompts_tokens_tensor = torch.LongTensor(encoded)
    prompts_length_tensor = torch.LongTensor(lengths)

    attention_mask = torch.zeros(prompts_tokens_tensor.shape[:2])
    for row, length in enumerate(lengths):
        attention_mask[row, :length] = 1

    return prompts_tokens_tensor, prompts_length_tensor, attention_mask
def _tokenize_prompt(prompt, tokenizer, add_BOS=False, media_info={'<image>': 65}):
media_tokens = {k: -int(i+1) for i, k in enumerate(media_info.keys())}
media_lengths = media_info.copy()
if add_BOS:
prompt_chunk = [tokenizer.bos_token_id]
else:
prompt_chunk = []
# Pure Text
if all([media_token not in prompt for media_token in media_tokens.keys()]):
enc_chunk = prompt_chunk + \
tokenizer(prompt, add_special_tokens=False)['input_ids']
# Multi-Modal Text
else:
enc_chunk = prompt_chunk
pattern = '|'.join(map(re.escape, list(media_tokens.keys())))
chunk_strs = re.split(f'({pattern})', prompt)
chunk_strs = [x for x in chunk_strs if len(x) > 0]
for idx, chunk_str in enumerate(chunk_strs):
if chunk_str in media_tokens:
enc_chunk += [media_tokens[chunk_str]] * \
media_lengths[chunk_str]
else:
tmp_chunk = tokenizer(chunk_str, add_special_tokens=False)[
'input_ids']
# if idx < len(chunk_strs) - 1: # Last chunk should not have eos
# tmp_chunk += [tokenizer.eod_id]
enc_chunk += tmp_chunk
return enc_chunk
+29
View File
@@ -0,0 +1,29 @@
import PIL
def pad_image(image, target_size):
    """
    Resize an image to fit inside `target_size` and centre it on a black canvas.

    The aspect ratio is preserved. Note that the returned canvas is 128 pixels
    larger than `target_size` in both width and height.

    :param image: input PIL image
    :param target_size: a tuple (width, height)
    :return: new RGB image of size (width + 128, height + 128)
    """
    # `import PIL` alone does not load the `Image` submodule, so import it explicitly.
    from PIL import Image

    iw, ih = image.size  # size of the original image
    w, h = target_size  # size of the target box
    scale = min(w / iw, h / ih)  # largest scale at which the image still fits the box

    # At least one side ends up matching the target box; +0.5 rounds to nearest.
    nw = int(iw * scale + 0.5)
    nh = int(ih * scale + 0.5)

    # The canvas gets an extra 128 px in each dimension on top of the target size.
    w += 128
    h += 128

    image = image.resize((nw, nh), Image.BICUBIC)  # bicubic interpolation
    new_image = Image.new('RGB', (w, h), (0, 0, 0))  # black canvas
    # Integer division centres the resized image; the remaining border stays black.
    new_image.paste(image, ((w - nw) // 2, (h - nh) // 2))

    return new_image