MultimodalOCR/models/mPLUG_Owl/pipeline/mPLUG.py

import torch
import numpy as np
import requests
from PIL import Image, ImageOps
from mplug_owl.modeling_mplug_owl import MplugOwlForConditionalGeneration
from mplug_owl.tokenization_mplug_owl import MplugOwlTokenizer
from mplug_owl.processing_mplug_owl import MplugOwlImageProcessor, MplugOwlProcessor
def resize_image(image, target_size):
    """Resize ``image`` to fit inside ``target_size`` and pad it to that size.

    The aspect ratio of the input is preserved. The resized image is placed
    at the top-left corner and the remaining area on the right and bottom is
    filled with the value 0.

    Args:
        image (PIL.Image.Image): Source image; only ``size`` and ``resize``
            are used before it is handed to ``ImageOps.expand``.
        target_size (Tuple[int, int]): Output size as ``(width, height)``.

    Returns:
        PIL.Image.Image: A new image whose size equals ``target_size``.
    """
    target_width, target_height = target_size
    width, height = image.size
    aspect_ratio = width / height
    # Compare against the target's own aspect ratio (not 1) so that the
    # resized image always fits inside a non-square target as well. For a
    # square target this is the same test as ``aspect_ratio > 1``.
    if aspect_ratio > target_width / target_height:
        # Relatively wider than the target: the width is the limiting side.
        new_width = target_width
        new_height = int(new_width / aspect_ratio)
    else:
        # Relatively taller than (or equal to) the target: the height is the
        # limiting side.
        new_height = target_height
        new_width = int(new_height * aspect_ratio)
    # Truncation can produce 0 for extreme aspect ratios, which ``resize``
    # rejects; float rounding must also never exceed the target size.
    new_width = min(target_width, max(1, new_width))
    new_height = min(target_height, max(1, new_height))
    image = image.resize((new_width, new_height))
    # Pad only on the right and bottom so the content stays anchored at (0, 0).
    right_padding = target_width - image.size[0]
    bottom_padding = target_height - image.size[1]
    padded_image = ImageOps.expand(image, border=(0, 0, right_padding, bottom_padding), fill=0)
    return padded_image


def get_model(pretrained_ckpt, use_bf16=False):
    """Load the model, tokenizer and combined processor from one checkpoint.

    Args:
        pretrained_ckpt (string): Checkpoint location passed to every
            ``from_pretrained`` call.
        use_bf16 (bool, optional): Load the model weights as ``torch.bfloat16``
            when True, otherwise as ``torch.half``. Defaults to False.

    Returns:
        model: The ``MplugOwlForConditionalGeneration`` instance.
        tokenizer: The ``MplugOwlTokenizer`` instance.
        processor: An ``MplugOwlProcessor`` built from the image processor and
            the tokenizer.
    """
    if use_bf16:
        weight_dtype = torch.bfloat16
    else:
        weight_dtype = torch.half

    # Load order (model, image processor, tokenizer) is kept as-is.
    model = MplugOwlForConditionalGeneration.from_pretrained(pretrained_ckpt, torch_dtype=weight_dtype)
    image_processor = MplugOwlImageProcessor.from_pretrained(pretrained_ckpt)
    tokenizer = MplugOwlTokenizer.from_pretrained(pretrained_ckpt)
    return model, tokenizer, MplugOwlProcessor(image_processor, tokenizer)


def do_generate(prompts, image_list, model, tokenizer, processor, use_bf16=False, **generate_kwargs):
    """Run the processor and the model, then decode the first generated sequence.

    Args:
        prompts (List[str]): The prompt text.
        image_list (list): Images forwarded unchanged to ``processor`` as its
            ``images`` argument (the caller in this file passes PIL images).
        model (MplugOwlForConditionalGeneration): Model used for generation;
            inputs are moved to ``model.device``.
        tokenizer (MplugOwlTokenizer): Tokenizer used to decode the output ids.
        processor (MplugOwlProcessor): Builds the model inputs from text and images.
        use_bf16 (bool, optional): Cast float32 inputs to ``torch.bfloat16`` when
            True, otherwise to ``torch.half``. This mirrors the dtype selection
            in ``get_model`` so inputs match the loaded weights. Defaults to False.
        **generate_kwargs: Extra keyword arguments forwarded to ``model.generate``.

    Returns:
        sentence (str): Decoded text of the first generated sequence.
    """
    # Honour ``use_bf16`` instead of always casting to bfloat16, so a model
    # loaded in half precision receives half-precision inputs.
    float_dtype = torch.bfloat16 if use_bf16 else torch.half
    inputs = processor(text=prompts, images=image_list, return_tensors='pt')
    # Only float32 tensors are cast; integer tensors (e.g. token ids) keep
    # their dtype. Every tensor is then moved to the model's device.
    inputs = {
        key: (value.to(float_dtype) if value.dtype == torch.float else value).to(model.device)
        for key, value in inputs.items()
    }
    with torch.no_grad():
        res = model.generate(**inputs, **generate_kwargs)
    # Only the first sequence of the batch is decoded and returned.
    sentence = tokenizer.decode(res.tolist()[0], skip_special_tokens=True)
    return sentence
class mPLUG:
    """Thin wrapper that loads an mPLUG-Owl checkpoint and answers a question about one image."""

    def __init__(self, base_model, device) -> None:
        """Load the model in bfloat16 and move it to ``device``.

        Args:
            base_model: Checkpoint location forwarded to ``get_model``.
            device: Target device passed to ``model.to``.
        """
        model, tokenizer, processor = get_model(base_model, use_bf16=True)
        self.model = model.to(device)
        self.tokenizer = tokenizer
        self.processor = processor

    def generate(self, image, question, name='resize'):
        """Answer ``question`` about the image and return the decoded text.

        Args:
            image: Anything accepted by ``PIL.Image.open`` (the value is passed
                to it unchanged).
            question (str): Question inserted into the conversation prompt.
            name (str, optional): When equal to ``'resize'`` the image is
                resized and padded to 224x224 via ``resize_image`` before being
                processed; any other value leaves the image untouched.
                Defaults to ``'resize'``.

        Returns:
            str: The sentence returned by ``do_generate``.
        """
        prompts = [f'''The following is a conversation between a curious human and AI assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
Human: <image>
Human: {question}
AI: ''']
        # Open the image in a context manager so the underlying file handle is
        # released deterministically, including when generation raises.
        with Image.open(image) as opened:
            # NOTE(review): the original note here read
            # "ct80 none 0.3229166666666667 resize 0.8159722222222222" --
            # presumably scores on a ct80 set without / with resizing; confirm.
            if name == 'resize':
                model_image = resize_image(opened, (224, 224))
            else:
                model_image = opened
            # Generation runs inside the ``with`` block because the un-resized
            # image may still need to read its pixel data from the open file.
            sentence = do_generate(
                prompts, [model_image], self.model,
                self.tokenizer, self.processor, use_bf16=True,
                max_length=512, top_k=1, do_sample=True)
        return sentence
add readme (#10) * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * remove submodule * add mPLUG MiniGPT4 * Update Readme.md * Update Readme.md * Update Readme.md --------- Co-authored-by: Yuliang Liu <34134635+Yuliang-Liu@users.noreply.github.com> 2023-06-01 09:57:03 +08:00			`import torch`
			`import numpy as np`
			`import requests`
			`from PIL import Image, ImageOps`
			`from mplug_owl.modeling_mplug_owl import MplugOwlForConditionalGeneration`
			`from mplug_owl.tokenization_mplug_owl import MplugOwlTokenizer`
			`from mplug_owl.processing_mplug_owl import MplugOwlImageProcessor, MplugOwlProcessor`
			`def resize_image(image, target_size):`
			`width, height = image.size`
			`aspect_ratio = width / height`
			`if aspect_ratio > 1:`
			`# 宽度大于高度，以宽度为基准进行 resize`
			`new_width = target_size[0]`
			`new_height = int(new_width / aspect_ratio)`
			`else:`
			`# 高度大于宽度，以高度为基准进行 resize`
			`new_height = target_size[1]`
			`new_width = int(new_height * aspect_ratio)`
			`image = image.resize((new_width, new_height))`
			`width_diff = target_size[0] - image.size[0]`
			`height_diff = target_size[1] - image.size[1]`
			`left_padding = 0`
			`top_padding = 0`
			`right_padding = width_diff - left_padding`
			`bottom_padding = height_diff - top_padding`
			`padded_image = ImageOps.expand(image, border=(left_padding, top_padding, right_padding, bottom_padding), fill=0)`
			`return padded_image`


			`def get_model(pretrained_ckpt, use_bf16=False):`
			`"""Model Provider with tokenizer and processor.`

			`Args:`
			`pretrained_ckpt (string): The path to pre-trained checkpoint.`
			`use_bf16 (bool, optional): Whether to use bfloat16 to load the model. Defaults to False.`

			`Returns:`
			`model: MplugOwl Model`
			`tokenizer: MplugOwl text tokenizer`
			`processor: MplugOwl processor (including text and image)`
			`"""`
			`model = MplugOwlForConditionalGeneration.from_pretrained(`
			`pretrained_ckpt,`
			`torch_dtype=torch.bfloat16 if use_bf16 else torch.half,`
			`)`
			`image_processor = MplugOwlImageProcessor.from_pretrained(pretrained_ckpt)`
			`tokenizer = MplugOwlTokenizer.from_pretrained(pretrained_ckpt)`
			`processor = MplugOwlProcessor(image_processor, tokenizer)`
			`return model, tokenizer, processor`


			`def do_generate(prompts, image_list, model, tokenizer, processor, use_bf16=False, **generate_kwargs):`
			`"""The interface for generation`

			`Args:`
			`prompts (List[str]): The prompt text`
			`image_list (List[str]): Paths of images`
			`model (MplugOwlForConditionalGeneration): MplugOwlForConditionalGeneration`
			`tokenizer (MplugOwlTokenizer): MplugOwlTokenizer`
			`processor (MplugOwlProcessor): MplugOwlProcessor`
			`use_bf16 (bool, optional): Whether to use bfloat16. Defaults to False.`

			`Returns:`
			`sentence (str): Generated sentence.`
			`"""`
			`inputs = processor(text=prompts, images=image_list, return_tensors='pt')`
			`inputs = {k: v.bfloat16() if v.dtype == torch.float else v for k, v in inputs.items()}`
			`inputs = {k: v.to(model.device) for k, v in inputs.items()}`
			`with torch.no_grad():`
			`res = model.generate(**inputs, **generate_kwargs)`
			`sentence = tokenizer.decode(res.tolist()[0], skip_special_tokens=True)`
			`return sentence`
			`class mPLUG:`
			`def __init__(self, base_model, device) -> None:`
			`model, tokenizer, processor = get_model(base_model, use_bf16=True)`
			`self.model = model.to(device)`
			`self.tokenizer = tokenizer`
			`self.processor = processor`
			`def generate(self, image, question,name='resize'):`
			`prompts = [f'''The following is a conversation between a curious human and AI assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.`
			`Human: <image>`
			`Human: {question}`
			`AI: ''']`
			`image = Image.open(image)`
			`#ct80 none 0.3229166666666667 resize 0.8159722222222222`
			`if name == 'resize':`
			`image = resize_image(image,(224,224))`
			`image_list=[image]`
			`sentence = do_generate(`
			`prompts, image_list, self.model,`
			`self.tokenizer, self.processor, use_bf16=True,`
			`max_length=512, top_k=1, do_sample=True)`
			`return sentence`