add readme (#10)

* Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * Update Readme.md * remove submodule * add mPLUG MiniGPT4 * Update Readme.md * Update Readme.md * Update Readme.md --------- Co-authored-by: Yuliang Liu <34134635+Yuliang-Liu@users.noreply.github.com>
2023-06-01 09:57:03 +08:00
parent 64f7eb334d
commit 3213a65d96
275 changed files with 16059 additions and 6 deletions
--- a/models/mPLUG_Owl/pipeline/utils.py
+++ b/models/mPLUG_Owl/pipeline/utils.py
@@ -0,0 +1,156 @@
+import math
+import random
+import torch
+import numpy as np
+from icecream import ic
+
+def print_rank_0(message):
+    """Print ``message`` once per job.
+
+    When ``torch.distributed`` is initialized, only the process whose rank is
+    0 prints. When it is not initialized, the calling process always prints.
+    Output is flushed immediately in both cases.
+    """
+    distributed_ready = torch.distributed.is_initialized()
+    if distributed_ready and torch.distributed.get_rank() != 0:
+        # Non-zero ranks stay silent so the message appears only once.
+        return
+    print(message, flush=True)
+
+# Module-level slot holding whatever object was last passed to set_args().
+# It stays None until set_args() is called.
+ARGS = None
+def set_args(args):
+    """Store ``args`` in the module-level ``ARGS`` slot, replacing any previous value."""
+    global ARGS
+    ARGS = args
+
+def get_args():
+    """Return the object last stored by ``set_args`` (``None`` if never set)."""
+    return ARGS
+
+# Module-level slot holding whatever object was last passed to set_tokenizer().
+# It stays None until set_tokenizer() is called.
+TOKENIZER = None
+def set_tokenizer(tokenizer):
+    """Store ``tokenizer`` in the module-level ``TOKENIZER`` slot, replacing any previous value."""
+    global TOKENIZER
+    TOKENIZER = tokenizer
+
+def get_tokenizer():
+    """Return the object last stored by ``set_tokenizer`` (``None`` if never set)."""
+    return TOKENIZER
+from torch import distributed as dist
+
+class worker_init:
+    """Seeding hook that gives each (rank, epoch, worker) its own ``random`` seed.
+
+    ``_worker_init_fn`` takes a worker id and seeds Python's ``random`` module
+    with ``worker_id + epoch_id * 1e4 + rank * 1e8``.
+    NOTE(review): presumably passed as a DataLoader ``worker_init_fn`` - confirm
+    against the caller.
+    NOTE(review): only Python's ``random`` is seeded here; numpy and torch RNG
+    state are not touched by this hook.
+    """
+
+    def __init__(self, epoch_id):
+        # Epoch index mixed into every worker seed produced by this instance.
+        self.epoch_id = epoch_id
+
+    def _worker_init_fn(self, worker_id):
+        """Seed ``random`` for the worker identified by ``worker_id``."""
+        # Without an initialized process group there is no rank to ask for;
+        # treat the process as rank 0 so single-process runs keep working.
+        if dist.is_available() and dist.is_initialized():
+            rank = dist.get_rank()
+        else:
+            rank = 0
+        # Integer arithmetic keeps the seed an int instead of a float.
+        random.seed(worker_id + self.epoch_id * 10 ** 4 + rank * 10 ** 8)
+
+
+def batchify(batch):
+    """Collate a list of samples into one batch dict (a ``collate_fn``).
+
+    Each sample is read as ``sample["image"]`` (a tensor or ``None``) and
+    ``sample["text"]``, a mapping providing the sequences ``input_ids``,
+    ``non_padding_mask``, ``non_media_mask`` and ``prompt_mask``. The per-sample
+    sequences for one key are stacked, so they must all have the same length.
+
+    Returns:
+        dict with
+        - ``pixel_values``: all non-``None`` images concatenated along dim 0,
+          or ``None`` when no sample carries an image;
+        - ``input_ids``: stacked ``input_ids`` as a LongTensor;
+        - ``labels``: an independent copy of ``input_ids``;
+        - ``num_images``: LongTensor with each sample's ``image.size(0)``
+          (0 for samples without an image);
+        - ``non_padding_mask``, ``non_media_mask``, ``prompt_mask``: the
+          corresponding stacked LongTensors.
+    """
+    images = [sample["image"] for sample in batch]
+    present = [img for img in images if img is not None]
+    # Samples may contribute different numbers of images, so they are
+    # concatenated rather than stacked; num_images records each share.
+    pixel_values = torch.cat(present, dim=0) if present else None
+    num_images = torch.LongTensor(
+        [0 if img is None else img.size(0) for img in images])
+
+    def stack_text(key):
+        # Stack one per-sample text field into a (len(batch), ...) LongTensor.
+        return torch.stack(
+            [torch.LongTensor(sample["text"][key]) for sample in batch], dim=0)
+
+    input_ids = stack_text("input_ids")
+    return {
+        "pixel_values": pixel_values,
+        "input_ids": input_ids,
+        # Cloned so in-place edits to labels never leak into input_ids.
+        "labels": input_ids.clone(),
+        "num_images": num_images,
+        "non_padding_mask": stack_text("non_padding_mask"),
+        "non_media_mask": stack_text("non_media_mask"),
+        "prompt_mask": stack_text("prompt_mask"),
+    }
+
+
+def get_param_groups(modules,
+                     no_weight_decay_cond,
+                     scale_lr_cond,
+                     lr_mult):
+    """Split trainable parameters into optimizer groups.
+
+    Every parameter with ``requires_grad`` set is classified on two axes:
+
+    * weight decay: ``no_weight_decay_cond(name, param)`` decides when given;
+      otherwise names ending in ``.bias`` and 1-D parameters are exempt;
+    * learning-rate scaling: ``scale_lr_cond(name, param)`` decides when
+      given; otherwise nothing is scaled.
+
+    Returns a list of dicts ``{'params', 'wd_mult', 'lr_mult'}`` where
+    ``wd_mult`` is 1.0 or 0.0 and ``lr_mult`` is 1.0 or the ``lr_mult``
+    argument. Empty groups are left out.
+    """
+    # Keyed by (skip_decay, rescale). Insertion order fixes the output order:
+    # decayed/unscaled, decayed/scaled, undecayed/unscaled, undecayed/scaled.
+    buckets = {
+        (False, False): [],
+        (False, True): [],
+        (True, False): [],
+        (True, True): [],
+    }
+    for module in modules:
+        for name, param in module.named_parameters():
+            if not param.requires_grad:
+                continue
+
+            if no_weight_decay_cond is None:
+                # Default rule: leave biases and norm-style 1-D params unregularized.
+                skip_decay = name.endswith(".bias") or len(param.shape) == 1
+            else:
+                skip_decay = no_weight_decay_cond(name, param)
+
+            rescale = False if scale_lr_cond is None else scale_lr_cond(name, param)
+
+            buckets[(bool(skip_decay), bool(rescale))].append(param)
+
+    return [
+        {'params': members,
+         'wd_mult': 0.0 if skip_decay else 1.0,
+         'lr_mult': lr_mult if rescale else 1.0}
+        for (skip_decay, rescale), members in buckets.items()
+        if members
+    ]
+
+def get_cosine_schedule_with_warmup(
+    optimizer, lr, min_lr, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1
+):
+    """
+    Create a schedule whose learning-rate multiplier rises linearly from
+    ``min_lr / lr`` to 1 during warmup and then follows a cosine curve from 1
+    back down to ``min_lr / lr``.
+
+    The multiplier is applied by ``LambdaLR`` to the initial lr set in the
+    optimizer; ``lr`` and ``min_lr`` are used only to compute the floor ratio.
+
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        lr (`float`):
+            The peak learning rate; must be non-zero.
+        min_lr (`float`):
+            The learning rate at step 0 and at the bottom of the cosine decay.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (`int`):
+            The total number of training steps.
+        num_cycles (`float`, *optional*, defaults to 0.5):
+            The number of waves in the cosine schedule (the default is to just decrease from the max value to the
+            floor following a half-cosine).
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+
+    Raises:
+        ZeroDivisionError: if ``lr`` is 0.
+    """
+    from torch.optim.lr_scheduler import LambdaLR
+
+    # Lowest multiplier (min_lr relative to lr) and the range above it.
+    floor_ratio = min_lr / lr
+    span = 1.0 - floor_ratio
+
+    def lr_lambda(current_step):
+        if current_step < num_warmup_steps:
+            # Linear ramp: floor_ratio at step 0, 1.0 when warmup completes.
+            return floor_ratio + span * float(current_step) / float(max(1, num_warmup_steps))
+        progress = float(current_step - num_warmup_steps) / \
+            float(max(1, num_training_steps - num_warmup_steps))
+        cosine = max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
+        # Same floor and span as warmup, so the decay bottoms out at min_lr / lr
+        # rather than at (lr - min_lr) / lr.
+        return floor_ratio + span * cosine
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch)