update
models/OpenFlamingo/OpenFlamingo.py (new file, +53)
@@ -0,0 +1,53 @@
from open_flamingo import create_model_and_transforms
from huggingface_hub import hf_hub_download
import torch
from ..process import pad_image, resize_image
from PIL import Image
import re


def postprocess_vqa_generation(predictions):
    # Keep only the text before the first "Question"/"Answer" marker.
    return re.split("Question|Answer", predictions, maxsplit=1)[0]


class OpenFlamingo:
    def __init__(self, llama_path, check_point, device) -> None:
        model, image_processor, tokenizer = create_model_and_transforms(
            clip_vision_encoder_path="ViT-L-14",
            clip_vision_encoder_pretrained="openai",
            lang_encoder_path=llama_path,
            tokenizer_path=llama_path,
            cross_attn_every_n_layers=4,
        )
        # Load the trained OpenFlamingo weights on CPU first, then move
        # the assembled model to the target device.
        checkpoint = torch.load(check_point, map_location="cpu")
        model.load_state_dict(checkpoint, strict=False)
        # checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-9B", "checkpoint.pt")
        self.model = model.to(device)
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.device = device

    def generate(self, image, question, name="resize"):
        self.tokenizer.padding_side = "left"
        lang_x = self.tokenizer(
            [f"<image>Question:{question} Answer:"],
            return_tensors="pt",
        ).to(self.device)
        # Remember the prompt length so only newly generated tokens are decoded.
        len_input = len(lang_x["input_ids"][0])
        image = Image.open(image)
        if name == "resize":
            image = resize_image(image, (224, 224))
        # OpenFlamingo expects vision_x of shape (batch, num_images, num_frames, C, H, W).
        vision_x = [self.image_processor(image).unsqueeze(0)]
        vision_x = torch.cat(vision_x, dim=0)
        vision_x = vision_x.unsqueeze(1).unsqueeze(0).to(self.device)
        generated_text = self.model.generate(
            vision_x=vision_x,
            lang_x=lang_x["input_ids"],
            attention_mask=lang_x["attention_mask"],
            max_new_tokens=48,
            num_beams=3,
        )
        answer = self.tokenizer.decode(
            generated_text[0][len_input:], skip_special_tokens=True
        )
        # process_function = postprocess_vqa_generation
        # new_predictions = [
        #     process_function(out)
        #     for out in self.tokenizer.batch_decode(generated_text, skip_special_tokens=True)
        # ]
        return answer
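Note that resize_image and pad_image are imported from the sibling ..process module, which is not part of this diff. A minimal sketch of what those helpers plausibly look like, assuming resize_image stretches to an exact size and pad_image letterboxes while preserving aspect ratio:

# Hypothetical sketch of the ..process helpers; the real module is not shown in this commit.
from PIL import Image

def resize_image(image, size):
    # Stretch to the exact target size, e.g. (224, 224); ignores aspect ratio.
    return image.convert("RGB").resize(size)

def pad_image(image, size):
    # Letterbox: fit inside `size` preserving aspect ratio, then pad with black.
    image = image.convert("RGB")
    image.thumbnail(size)
    canvas = Image.new("RGB", size, (0, 0, 0))
    canvas.paste(image, ((size[0] - image.width) // 2, (size[1] - image.height) // 2))
    return canvas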
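For completeness, a hedged usage sketch of the wrapper above, assuming the models package is importable (the class uses a relative import, so it cannot run as a standalone script). The weight locations and image path are placeholders, not values taken from this commit:

# Hypothetical usage; llama_path, check_point, and image paths are placeholders.
import torch
from models.OpenFlamingo.OpenFlamingo import OpenFlamingo

device = "cuda" if torch.cuda.is_available() else "cpu"
model = OpenFlamingo(
    llama_path="/path/to/llama-7b-hf",     # HF-format LLaMA weights (assumed)
    check_point="/path/to/checkpoint.pt",  # OpenFlamingo checkpoint (assumed)
    device=device,
)
answer = model.generate("example.jpg", "What is shown in the image?")
print(answer)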