MultimodalOCR/models/OpenFlamingo/OpenFlamingo.py
from open_flamingo import create_model_and_transforms
from huggingface_hub import hf_hub_download
import torch
from ..process import pad_image, resize_image
from PIL import Image
import re


def postprocess_vqa_generation(predictions):
    # Keep only the text preceding the first "Question" or "Answer" marker,
    # i.e. the model's answer to the current prompt.
    return re.split("Question|Answer", predictions, maxsplit=1)[0]


class OpenFlamingo:
    def __init__(self, llama_path, check_point, device) -> None:
        # Build OpenFlamingo: a CLIP ViT-L/14 vision encoder wired into the
        # LLaMA language model via cross-attention every 4 layers.
        model, image_processor, tokenizer = create_model_and_transforms(
            clip_vision_encoder_path="ViT-L-14",
            clip_vision_encoder_pretrained="openai",
            lang_encoder_path=llama_path,
            tokenizer_path=llama_path,
            cross_attn_every_n_layers=4,
        )
        # Load the OpenFlamingo weights on CPU first; strict=False because
        # the checkpoint covers only the trained (non-LLaMA) parameters.
        checkpoint = torch.load(check_point, map_location="cpu")
        model.load_state_dict(checkpoint, strict=False)
        # checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-9B", "checkpoint.pt")
        self.model = model.to(device)
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.device = device
    def generate(self, image, question, name="resize"):
        self.tokenizer.padding_side = "left"
        # The <image> token marks where the visual features are attended to.
        lang_x = self.tokenizer(
            [f"<image>Question:{question} Answer:"],
            return_tensors="pt",
        ).to(self.device)
        len_input = len(lang_x["input_ids"][0])
        image = Image.open(image)
        if name == "resize":
            image = resize_image(image, (224, 224))
        # OpenFlamingo expects vision input of shape
        # (batch, num_images, num_frames, channels, height, width).
        vision_x = [self.image_processor(image).unsqueeze(0)]
        vision_x = torch.cat(vision_x, dim=0)
        vision_x = vision_x.unsqueeze(1).unsqueeze(0).to(self.device)
        generated_text = self.model.generate(
            vision_x=vision_x,
            lang_x=lang_x["input_ids"],
            attention_mask=lang_x["attention_mask"],
            max_new_tokens=48,
            num_beams=3,
        )
        # Decode only the newly generated tokens, skipping the prompt.
        answer = self.tokenizer.decode(generated_text[0][len_input:], skip_special_tokens=True)
        # Unused alternative: strip trailing "Question"/"Answer" turns from
        # the full decoded output with postprocess_vqa_generation.
        # process_function = postprocess_vqa_generation
        # new_predictions = [
        #     process_function(out)
        #     for out in self.tokenizer.batch_decode(generated_text, skip_special_tokens=True)
        # ]
        return answer
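

# --- Usage sketch (illustrative; not part of the original file) ---
# A minimal example of how this wrapper might be driven. The paths below are
# hypothetical placeholders: llama_path should point at local LLaMA-7B
# weights and check_point at an OpenFlamingo checkpoint (e.g. one fetched
# with the hf_hub_download call commented out above). Because of the
# relative import of ..process, this module must be run as part of the
# MultimodalOCR package for the guard below to work.
if __name__ == "__main__":
    ocr = OpenFlamingo(
        llama_path="/path/to/llama-7b",        # hypothetical path
        check_point="/path/to/checkpoint.pt",  # hypothetical path
        device="cuda:0" if torch.cuda.is_available() else "cpu",
    )
    print(ocr.generate("/path/to/sample.png", "What is the text in the image?"))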