MultimodalOCR/models/OpenFlamingo/OpenFlamingo.py
from open_flamingo import create_model_and_transforms
from huggingface_hub import hf_hub_download
import torch
from ..process import pad_image, resize_image
from PIL import Image
import re


def postprocess_vqa_generation(predictions):
    # Keep only the text preceding the first "Question" or "Answer" marker,
    # i.e. the model's answer to the current prompt.
    return re.split("Question|Answer", predictions, maxsplit=1)[0]


class OpenFlamingo:
    def __init__(self, llama_path, check_point, device) -> None:
        # Build OpenFlamingo: a CLIP ViT-L/14 vision encoder wired into the
        # LLaMA language model via cross-attention every 4 layers.
        model, image_processor, tokenizer = create_model_and_transforms(
            clip_vision_encoder_path="ViT-L-14",
            clip_vision_encoder_pretrained="openai",
            lang_encoder_path=llama_path,
            tokenizer_path=llama_path,
            cross_attn_every_n_layers=4,
        )
        # Load the OpenFlamingo weights on CPU first; strict=False because
        # the checkpoint covers only the trained (non-LLaMA) parameters.
        checkpoint = torch.load(check_point, map_location="cpu")
        model.load_state_dict(checkpoint, strict=False)
        # checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-9B", "checkpoint.pt")
        self.model = model.to(device)
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.device = device
    def generate(self, image, question, name="resize"):
        self.tokenizer.padding_side = "left"
        # The <image> token marks where the visual features are attended to.
        lang_x = self.tokenizer(
            [f"<image>Question:{question} Answer:"],
            return_tensors="pt",
        ).to(self.device)
        len_input = len(lang_x["input_ids"][0])
        image = Image.open(image)
        if name == "resize":
            image = resize_image(image, (224, 224))
        # OpenFlamingo expects vision input of shape
        # (batch, num_images, num_frames, channels, height, width).
        vision_x = [self.image_processor(image).unsqueeze(0)]
        vision_x = torch.cat(vision_x, dim=0)
        vision_x = vision_x.unsqueeze(1).unsqueeze(0).to(self.device)
        generated_text = self.model.generate(
            vision_x=vision_x,
            lang_x=lang_x["input_ids"],
            attention_mask=lang_x["attention_mask"],
            max_new_tokens=48,
            num_beams=3,
        )
        # Decode only the newly generated tokens, skipping the prompt.
        answer = self.tokenizer.decode(generated_text[0][len_input:], skip_special_tokens=True)
        # Unused alternative: strip trailing "Question"/"Answer" turns from
        # the full decoded output with postprocess_vqa_generation.
        # process_function = postprocess_vqa_generation
        # new_predictions = [
        #     process_function(out)
        #     for out in self.tokenizer.batch_decode(generated_text, skip_special_tokens=True)
        # ]
        return answer
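

# --- Usage sketch (illustrative; not part of the original file) ---
# A minimal example of how this wrapper might be driven. The paths below are
# hypothetical placeholders: llama_path should point at local LLaMA-7B
# weights and check_point at an OpenFlamingo checkpoint (e.g. one fetched
# with the hf_hub_download call commented out above). Because of the
# relative import of ..process, this module must be run as part of the
# MultimodalOCR package for the guard below to work.
if __name__ == "__main__":
    ocr = OpenFlamingo(
        llama_path="/path/to/llama-7b",        # hypothetical path
        check_point="/path/to/checkpoint.pt",  # hypothetical path
        device="cuda:0" if torch.cuda.is_available() else "cpu",
    )
    print(ocr.generate("/path/to/sample.png", "What is the text in the image?"))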