diff --git a/.gitignore b/.gitignore
index 35cc9d2..a04c0d4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,10 +17,16 @@
*.ipynb
*.pyc
*.log
-
+*.pth
!docs/
!docs/**/*.png
!docs/**/*.jpg
!docs/**/*.jpeg
!docs/**/*.gif
-!docs/**/*.svg
\ No newline at end of file
+!docs/**/*.svg
+/data/weights
+*.bin
+*.safetensors
+src/model/TextGen/weights/Usage Restriction Statement
+data/FUNSD/testing_data/.DS_Store
+data/FUNSD/training_data/.DS_Store
diff --git a/src/model/TextGen/README.md b/src/model/TextGen/README.md
new file mode 100644
index 0000000..5f2d446
--- /dev/null
+++ b/src/model/TextGen/README.md
@@ -0,0 +1,192 @@
+# TextCtrl: Diffusion-based Scene Text Editing with Prior Guidance Control [NeurIPS 2024]
+
+## TODOs
+- [x] Release the ScenePair benchmark dataset and the model code;
+- [x] Release checkpoints and inference code;
+- [x] Release the training pipeline;
+
+
+## 1 Installation
+### 1.1 Code Preparation
+```bash
+# Clone the repo
+$ git clone https://github.com/weichaozeng/TextCtrl.git
+$ cd TextCtrl/
+# Install required packages
+$ conda create --name textctrl python=3.8
+$ conda activate textctrl
+$ pip install torch==1.13.0+cu116 torchvision==0.14.0+cu116 torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cu116
+$ pip install -r requirement.txt
+```
+### 1.2 Checkpoints Preparation
+Download the checkpoints from [Link_1](https://drive.google.com/drive/folders/1OMgXXIXi-VN2hTlPywtdzIW5AJMIHzF0?usp=drive_link) and [Link_2](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5). The file structure should be set as follows:
+```bash
+TextCtrl/
+├── weights/
+│ ├── model.pth # weight of style encoder and unet
+│ ├── text_encoder.pth # weight of pretrained glyph encoder
+│ ├── style_encoder.pth # weight of pretrained style encoder
+│ ├── vision_model.pth # monitor weight
+│ ├── ocr_model.pth # ocr weight
+│ ├── vgg19.pth # vgg weight
+│ ├── vitstr_base_patch16_224.pth # vitstr weight
+│ └── sd/ # pretrained weight of stable-diffusion-v1-5
+│ ├── vae/
+│ ├── unet/
+│ └── scheduler/
+├── README.md
+├── ...
+```
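+The stable-diffusion-v1-5 components can also be fetched programmatically. Below is a minimal sketch using `huggingface_hub`; restricting the download to the subfolders shown in the tree above is an assumption, and the checkpoints from Link_1 are still required:
+```python
+from huggingface_hub import snapshot_download
+
+# Fetch only the vae/, unet/ and scheduler/ subfolders of SD-1.5 into weights/sd/.
+snapshot_download(
+    repo_id="stable-diffusion-v1-5/stable-diffusion-v1-5",
+    allow_patterns=["vae/*", "unet/*", "scheduler/*"],
+    local_dir="weights/sd",
+)
+```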
+## 2 Inference
+### 2.1 Data Preparation
+The inference data should follow the structure of *example/*:
+```bash
+TextCtrl/
+├── example/
+│ ├── i_s/ # source cropped text images
+│ ├── i_s.txt # filename and text label of source images in i_s/
+│ └── i_t.txt # filename and text label of target images
+```
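+
+*i_s.txt* and *i_t.txt* pair each image filename with a text label. A minimal sketch of preparing such files follows, assuming a space-separated `filename label` line format (the exact format is an assumption; check the files shipped in *example/*). The filenames and texts below are hypothetical:
+```python
+from pathlib import Path
+
+# Hypothetical entries: (filename, source text, target text).
+entries = [("0.png", "PRIVATE", "Private"), ("1.png", "FIRST", "First")]
+
+root = Path("example")
+(root / "i_s").mkdir(parents=True, exist_ok=True)  # cropped source text images go here
+(root / "i_s.txt").write_text("\n".join(f"{n} {s}" for n, s, _ in entries) + "\n")
+(root / "i_t.txt").write_text("\n".join(f"{n} {t}" for n, _, t in entries) + "\n")
+```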
+
+### 2.2 Edit Arguments
+Edit the arguments in *inference.py*, especially:
+```python
+parser.add_argument("--ckpt_path", type=str, default="weights/model.pth")
+parser.add_argument("--dataset_dir", type=str, default="example/")
+parser.add_argument("--output_dir", type=str, default="example_result/")
+```
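+Since these are standard `argparse` options, they can also be overridden on the command line instead of edited in place.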
+
+
+### 2.3 Generate Images
+The inference results can be found in *example_result/* after running:
+```bash
+$ PYTHONPATH=.../TextCtrl/ python inference.py
+```
+
+### 2.4 Inference Results
+| Source Images | Target Text | Infer Results | Reference GT |
+| --- | --- | --- | --- |
+| | *"Private"* | | |
+| | *"First"* | | |
+| | *"RECORDS"* | | |
+| | *"Sunset"* | | |
+| | *"Network"* | | |
+
+
+
+## 3 Training
+### 3.1 Data Preparation
+The training relies on synthetic data generated by [SRNet-Datagen](https://github.com/youdao-ai/SRNet-Datagen) with some [modifications](modify/) to produce the required elements. The file structure should be set as follows:
+```bash
+Syn_data/
+├── fonts/
+│ ├── arial.ttf
+│ └── .../
+├── train/
+│ ├── train-50k-1/
+│ ├── train-50k-2/
+│ ├── train-50k-3/
+│ └── train-50k-4/
+│ ├── i_s/
+│ ├── mask_s/
+│ ├── i_s.txt
+│ ├── t_f/
+│ ├── mask_t/
+│ ├── i_t.txt
+│ ├── t_t/
+│ ├── t_b/
+│ └── font.txt
+└── eval/
+ └── eval-1k/
+
+```
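+A quick sanity check of the generated layout is sketched below, assuming every `train-50k-*` shard contains the element folders and label files listed above (the required-element list is inferred from the tree):
+```python
+from pathlib import Path
+from typing import List
+
+REQUIRED = ["i_s", "mask_s", "t_f", "mask_t", "t_t", "t_b", "i_s.txt", "i_t.txt", "font.txt"]
+
+def missing_elements(shard: Path) -> List[str]:
+    """Return the required dataset elements absent from one train-50k-* shard."""
+    return [name for name in REQUIRED if not (shard / name).exists()]
+
+for shard in sorted(Path("Syn_data/train").glob("train-50k-*")):
+    if missing := missing_elements(shard):
+        print(f"{shard}: missing {missing}")
+```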
+### 3.2 Text Style Pretraining
+```bash
+$ cd prestyle/
+# Modify the directory paths in the config file
+$ cd configs/
+$ vi StyleTrain.yaml
+# Start pretraining
+$ cd ..
+$ python train.py
+```
+
+### 3.3 Text Glyph Pretraining
+```bash
+$ cd preglyph/
+# Modify the directory paths in the config file
+$ cd configs/
+$ vi GlyphTrain.yaml
+# Start pretraining
+$ cd ..
+$ python pretrain.py
+```
+
+### 3.4 Prior Guided Training
+```bash
+$ cd TextCtrl/
+# Modify the directory paths in the config file
+$ cd configs/
+$ vi train.yaml
+# Start training
+$ cd ..
+$ python train.py
+```
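+
+The YAML configs (see *configs/train.yaml*) describe every module as a `target` import path plus `params`. A minimal sketch of the usual dynamic-instantiation pattern for such configs is shown below, assuming OmegaConf is available; `instantiate_from_config` here is illustrative, not necessarily the repo's own helper:
+```python
+import importlib
+
+from omegaconf import OmegaConf
+
+def instantiate_from_config(config):
+    """Import config['target'] and call it with config['params'] as keyword arguments."""
+    module_name, cls_name = config["target"].rsplit(".", 1)
+    cls = getattr(importlib.import_module(module_name), cls_name)
+    return cls(**config.get("params", {}))
+
+cfg = OmegaConf.load("configs/train.yaml")
+model = instantiate_from_config(cfg.model)  # -> src.trainer.CtrlBase.ControlBase
+```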
+
+
+
+## 4 Evaluation
+### 4.1 Data Preparation
+Download the ScenePair dataset from [Link](https://drive.google.com/file/d/1m_o2R2kFj_hDXJP5K21aC7lKs-eUky9s/view?usp=sharing) and unzip the files. The structure of each folder is as follows:
+```bash
+├── ScenePair/
+│ ├── i_s/ # source cropped text images
+│ ├── t_f/ # target cropped text images
+│ ├── i_full/ # full-size images
+│ ├── i_s.txt # filename and text label of images in i_s/
+│ ├── i_t.txt # filename and text label of images in t_f/
+│ ├── i_s_full.txt # filename, text label, corresponding full-size image name and location information of images in i_s/
+│ └── i_t_full.txt # filename, text label, corresponding full-size image name and location information of images in t_f/
+```
+### 4.2 Generate Images
+Before evaluation, the edited images for the method under test should be generated on the ScenePair dataset and saved in a *'.../result_folder/'* with the same filenames. Results of some methods on the ScenePair dataset are provided [here](https://drive.google.com/file/d/1343td96X7SuE0hYsMbTHALFmr1Md7SnQ/view?usp=drive_link).
+
+### 4.3 Style Fidelity
+SSIM, PSNR, MSE and FID are used to evaluate the style fidelity of the edited results, with reference to [qqqyd/MOSTEL](https://github.com/qqqyd/MOSTEL).
+```bash
+$ cd evaluation/
+$ python evaluation.py --target_path .../result_folder/ --gt_path .../ScenePair/t_f/
+```
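+
+For reference, below is a minimal sketch of what the pixel-level metrics measure on a single image pair, assuming `scikit-image` and identically sized images (FID needs an Inception feature extractor and is omitted):
+```python
+from skimage import io
+from skimage.metrics import (mean_squared_error, peak_signal_noise_ratio,
+                             structural_similarity)
+
+def style_metrics(result_path: str, gt_path: str) -> dict:
+    """MSE / PSNR / SSIM between one edited image and its ground truth."""
+    result, gt = io.imread(result_path), io.imread(gt_path)
+    return {
+        "mse": mean_squared_error(gt, result),
+        "psnr": peak_signal_noise_ratio(gt, result),
+        "ssim": structural_similarity(gt, result, channel_axis=-1),
+    }
+```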
+
+### 4.4 Text Accuracy
+ACC and NED are used to evaluate the text accuracy of the edited results, with the official code and checkpoint from [clovaai/deep-text-recognition-benchmark](https://github.com/clovaai/deep-text-recognition-benchmark).
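+
+Both metrics are easy to state: ACC is exact-match word accuracy, and NED is commonly reported as 1 minus the length-normalized Levenshtein distance (the exact convention varies by paper). A minimal sketch with case-insensitive matching:
+```python
+def edit_distance(a: str, b: str) -> int:
+    """Levenshtein distance via classic dynamic programming."""
+    prev = list(range(len(b) + 1))
+    for i, ca in enumerate(a, 1):
+        cur = [i]
+        for j, cb in enumerate(b, 1):
+            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
+        prev = cur
+    return prev[-1]
+
+def text_metrics(preds, gts):
+    """Word accuracy (ACC) and normalized edit distance (NED) over prediction/label pairs."""
+    pairs = [(p.lower(), g.lower()) for p, g in zip(preds, gts)]
+    acc = sum(p == g for p, g in pairs) / len(pairs)
+    ned = sum(1 - edit_distance(p, g) / max(len(p), len(g), 1) for p, g in pairs) / len(pairs)
+    return acc, ned
+```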
+
+## Related Resources
+Many thanks to these great projects: [lksshw/SRNet](https://github.com/lksshw/SRNet),
+[youdao-ai/SRNet-Datagen](https://github.com/youdao-ai/SRNet-Datagen),
+[qqqyd/MOSTEL](https://github.com/qqqyd/MOSTEL),
+[UCSB-NLP-Chang/DiffSTE](https://github.com/UCSB-NLP-Chang/DiffSTE),
+[ZYM-PKU/UDiffText](https://github.com/ZYM-PKU/UDiffText),
+[TencentARC/MasaCtrl](https://github.com/TencentARC/MasaCtrl),
+[unilm/textdiffuser](https://github.com/microsoft/unilm/tree/master/textdiffuser)
+and [tyxsspa/AnyText](https://github.com/tyxsspa/AnyText).
+
+## Citation
+ @article{zeng2024textctrl,
+ title={TextCtrl: Diffusion-based scene text editing with prior guidance control},
+ author={Zeng, Weichao and Shu, Yan and Li, Zhenhang and Yang, Dongbao and Zhou, Yu},
+ journal={Advances in Neural Information Processing Systems},
+ volume={37},
+ pages={138569--138594},
+ year={2024}
+ }
diff --git a/src/model/TextGen/configs/inference.yaml b/src/model/TextGen/configs/inference.yaml
new file mode 100644
index 0000000..b85ef87
--- /dev/null
+++ b/src/model/TextGen/configs/inference.yaml
@@ -0,0 +1,76 @@
+model:
+ target: "src.trainer.CtrlBase.ControlBase"
+ params:
+ control_config:
+ target: "src.trainer.CtrlBase.StylePyramidNet"
+ params:
+ image_size: 256
+ patch_size: 16
+ in_channels: 3
+ embed_dim: 768
+ model_channels: 320
+ channel_mult: [ 1, 2, 4, 4 ]
+ pyramid_sizes: [ 32, 16, 8, 4]
+ use_checkpoint: True
+
+
+ base_config:
+ noise_scheduler: "diffusers.PNDMScheduler"
+ scheduler_config: "weights/sd/scheduler/scheduler_config.json"
+ num_inference_steps: 50
+ min_training_steps: 0
+
+ vae:
+ pretrained: "weights/sd/vae/"
+ normalizer: 0.18215
+
+ text_encoder_optimized: False
+ text_encoder:
+ target: "src.module.textencoder.modules.LabelEncoder"
+ params:
+ max_len: 24
+ emb_dim: 768
+ ckpt_path: "weights/text_encoder.pth"
+
+ unet_pretrained: "weights/sd/unet/diffusion_pytorch_model.bin"
+ unet:
+ target: "src.trainer.CtrlBase.ControlUNetModel"
+ params:
+ cross_attention_dim: 768
+
+ reconstruction_loss: True
+ ocr_loss_alpha: 0.01
+ cond_on_text_image: False
+ font_path: "weights/arial.ttf"
+
+ ocr_model:
+ height: 32
+ width: 128
+ ocr_supervised: False
+ pretrained: weights/ocr_model.pth
+ optimize: false
+ max_length: 25
+ charset_path: src/module/abinet/data/charset_36.txt
+ iter_size: 3
+ ensemble: ''
+ use_vision: False
+ vision:
+ checkpoint:
+ loss_weight: 1.
+ attention: 'position'
+ backbone: 'transformer'
+ backbone_ln: 3
+ max_length: 25
+ charset_path: src/module/abinet/data/charset_36.txt
+ language:
+ checkpoint:
+ num_layers: 4
+ loss_weight: 1.
+ detach: True
+ use_self_attn: False
+ max_length: 25
+ charset_path: src/module/abinet/data/charset_36.txt
+ alignment:
+ loss_weight: 1.
+ max_length: 25
+ charset_path: src/module/abinet/data/charset_36.txt
diff --git a/src/model/TextGen/configs/train.yaml b/src/model/TextGen/configs/train.yaml
new file mode 100644
index 0000000..963615b
--- /dev/null
+++ b/src/model/TextGen/configs/train.yaml
@@ -0,0 +1,114 @@
+data:
+ target: "src.dataset.textdata.WrappedDataModule"
+ batch_size: 8
+ train:
+ size: 256
+ root_dir: ".../Syn_data/train/"
+ font_dir: ".../Syn_data/fonts/"
+
+ validation:
+ size: 256
+ root_dir: ".../Syn_data/eval/"
+ font_dir: ".../Syn_data/fonts/"
+
+
+
+model:
+ target: "src.trainer.CtrlBase.ControlBase"
+ params:
+ control_config:
+ target: "src.trainer.CtrlBase.StylePyramidNet"
+ params:
+ image_size: 256
+ patch_size: 16
+ in_channels: 3
+ embed_dim: 768
+ model_channels: 320
+ channel_mult: [ 1, 2, 4, 4 ]
+ pyramid_sizes: [ 32, 16, 8, 4]
+ use_checkpoint: True
+
+
+ base_config:
+ noise_scheduler: "diffusers.PNDMScheduler"
+ scheduler_config: ".../weights/sd/scheduler/scheduler_config.json"
+ num_inference_steps: 50
+ min_training_steps: 0
+
+ vae:
+ pretrained: ".../weights/sd/vae/"
+ normalizer: 0.18215
+
+ text_encoder_optimized: False
+ text_encoder:
+ target: "src.module.textencoder.modules.LabelEncoder"
+ params:
+ max_len: 24
+ emb_dim: 768
+ ckpt_path: ".../weights/text_encoder.pth"
+
+ unet_pretrained: ".../weights/sd/unet/diffusion_pytorch_model.bin"
+ unet:
+ target: "src.trainer.CtrlBase.ControlUNetModel"
+ params:
+ cross_attention_dim: 768
+
+ reconstruction_loss: True
+ ocr_loss_alpha: 0.01
+ cond_on_text_image: False
+ font_path: ".../Syn_data/fonts/arial.ttf"
+
+ ocr_model:
+ height: 32
+ width: 128
+ ocr_supervised: True
+ pretrained: ".../weights/ocr_model.pth"
+ optimize: false
+ max_length: 25
+ charset_path: ".../src/module/abinet/data/charset_36.txt"
+ iter_size: 3
+ ensemble: ''
+ use_vision: False
+ vision:
+ checkpoint:
+ loss_weight: 1.
+ attention: 'position'
+ backbone: 'transformer'
+ backbone_ln: 3
+ max_length: 25
+ charset_path: ".../src/module/abinet/data/charset_36.txt"
+ language:
+ checkpoint:
+ num_layers: 4
+ loss_weight: 1.
+ detach: True
+ use_self_attn: False
+ max_length: 25
+ charset_path: ".../src/module/abinet/data/charset_36.txt"
+ alignment:
+ loss_weight: 1.
+ max_length: 25
+ charset_path: ".../src/module/abinet/data/charset_36.txt"
+
+ vgg_weight: ".../weights/vgg19.pth"
+
+#####################################################Lightning##########################################
+lightning:
+ log_every_n_steps: 32
+ accumulate_grad_batches: 8
+ max_epochs: 100
+ accelerator: "gpu"
+ strategy: ddp
+ default_root_dir: "./logs"
+
+image_logger:
+ target: "src.trainer.Base.BaseImageLogger"
+ params:
+ train_batch_frequency: 2000
+ valid_batch_frequency: 2
+ disable_wandb: false
+ generation_kwargs:
+ num_inference_steps: 50
+ num_sample_per_image: 1
+ guidance_scale: 2
+ seed: 42
diff --git a/src/model/TextGen/diffusers/__init__.py b/src/model/TextGen/diffusers/__init__.py
new file mode 100644
index 0000000..86eda73
--- /dev/null
+++ b/src/model/TextGen/diffusers/__init__.py
@@ -0,0 +1,110 @@
+from .utils import (
+ is_flax_available,
+ is_inflect_available,
+ is_onnx_available,
+ is_scipy_available,
+ is_torch_available,
+ is_transformers_available,
+ is_unidecode_available,
+)
+
+
+__version__ = "0.8.0.dev0"
+
+from .configuration_utils import ConfigMixin
+from .onnx_utils import OnnxRuntimeModel
+from .utils import logging
+
+
+if is_torch_available():
+ from .modeling_utils import ModelMixin
+ from .models import AutoencoderKL, Transformer2DModel, UNet1DModel, UNet2DConditionModel, UNet2DModel, VQModel
+ from .optimization import (
+ get_constant_schedule,
+ get_constant_schedule_with_warmup,
+ get_cosine_schedule_with_warmup,
+ get_cosine_with_hard_restarts_schedule_with_warmup,
+ get_linear_schedule_with_warmup,
+ get_polynomial_decay_schedule_with_warmup,
+ get_scheduler,
+ )
+ from .pipeline_utils import DiffusionPipeline
+ from .pipelines import (
+ DanceDiffusionPipeline,
+ DDIMPipeline,
+ DDPMPipeline,
+ KarrasVePipeline,
+ LDMPipeline,
+ LDMSuperResolutionPipeline,
+ PNDMPipeline,
+ RePaintPipeline,
+ ScoreSdeVePipeline,
+ )
+ from .schedulers import (
+ DDIMScheduler,
+ DDPMScheduler,
+ DPMSolverMultistepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ IPNDMScheduler,
+ KarrasVeScheduler,
+ PNDMScheduler,
+ RePaintScheduler,
+ SchedulerMixin,
+ ScoreSdeVeScheduler,
+ VQDiffusionScheduler,
+ )
+ from .training_utils import EMAModel
+else:
+ from .utils.dummy_pt_objects import * # noqa F403
+
+if is_torch_available() and is_scipy_available():
+ from .schedulers import LMSDiscreteScheduler
+else:
+ from .utils.dummy_torch_and_scipy_objects import * # noqa F403
+
+if is_torch_available() and is_transformers_available():
+ from .pipelines import (
+ CycleDiffusionPipeline,
+ LDMTextToImagePipeline,
+ StableDiffusionImg2ImgPipeline,
+ StableDiffusionInpaintPipeline,
+ StableDiffusionInpaintPipelineLegacy,
+ StableDiffusionPipeline,
+ VQDiffusionPipeline,
+ )
+else:
+ from .utils.dummy_torch_and_transformers_objects import * # noqa F403
+
+if is_torch_available() and is_transformers_available() and is_onnx_available():
+ from .pipelines import (
+ OnnxStableDiffusionImg2ImgPipeline,
+ OnnxStableDiffusionInpaintPipeline,
+ OnnxStableDiffusionPipeline,
+ StableDiffusionOnnxPipeline,
+ )
+else:
+ from .utils.dummy_torch_and_transformers_and_onnx_objects import * # noqa F403
+
+if is_flax_available():
+ from .modeling_flax_utils import FlaxModelMixin
+ from .models.unet_2d_condition_flax import FlaxUNet2DConditionModel
+ from .models.vae_flax import FlaxAutoencoderKL
+ from .pipeline_flax_utils import FlaxDiffusionPipeline
+ from .schedulers import (
+ FlaxDDIMScheduler,
+ FlaxDDPMScheduler,
+ FlaxDPMSolverMultistepScheduler,
+ FlaxKarrasVeScheduler,
+ FlaxLMSDiscreteScheduler,
+ FlaxPNDMScheduler,
+ FlaxSchedulerMixin,
+ FlaxScoreSdeVeScheduler,
+ )
+else:
+ from .utils.dummy_flax_objects import * # noqa F403
+
+if is_flax_available() and is_transformers_available():
+ from .pipelines import FlaxStableDiffusionPipeline
+else:
+ from .utils.dummy_flax_and_transformers_objects import * # noqa F403
diff --git a/src/model/TextGen/diffusers/commands/__init__.py b/src/model/TextGen/diffusers/commands/__init__.py
new file mode 100644
index 0000000..902bd46
--- /dev/null
+++ b/src/model/TextGen/diffusers/commands/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from argparse import ArgumentParser
+
+
+class BaseDiffusersCLICommand(ABC):
+ @staticmethod
+ @abstractmethod
+ def register_subcommand(parser: ArgumentParser):
+ raise NotImplementedError()
+
+ @abstractmethod
+ def run(self):
+ raise NotImplementedError()
diff --git a/src/model/TextGen/diffusers/commands/diffusers_cli.py b/src/model/TextGen/diffusers/commands/diffusers_cli.py
new file mode 100644
index 0000000..30084e5
--- /dev/null
+++ b/src/model/TextGen/diffusers/commands/diffusers_cli.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from argparse import ArgumentParser
+
+from .env import EnvironmentCommand
+
+
+def main():
+    parser = ArgumentParser("Diffusers CLI tool", usage="diffusers-cli <command> [<args>]")
+ commands_parser = parser.add_subparsers(help="diffusers-cli command helpers")
+
+ # Register commands
+ EnvironmentCommand.register_subcommand(commands_parser)
+
+ # Let's go
+ args = parser.parse_args()
+
+ if not hasattr(args, "func"):
+ parser.print_help()
+ exit(1)
+
+ # Run
+ service = args.func(args)
+ service.run()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/model/TextGen/diffusers/commands/env.py b/src/model/TextGen/diffusers/commands/env.py
new file mode 100644
index 0000000..81a878b
--- /dev/null
+++ b/src/model/TextGen/diffusers/commands/env.py
@@ -0,0 +1,70 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import platform
+from argparse import ArgumentParser
+
+import huggingface_hub
+
+from .. import __version__ as version
+from ..utils import is_torch_available, is_transformers_available
+from . import BaseDiffusersCLICommand
+
+
+def info_command_factory(_):
+ return EnvironmentCommand()
+
+
+class EnvironmentCommand(BaseDiffusersCLICommand):
+ @staticmethod
+ def register_subcommand(parser: ArgumentParser):
+ download_parser = parser.add_parser("env")
+ download_parser.set_defaults(func=info_command_factory)
+
+ def run(self):
+ hub_version = huggingface_hub.__version__
+
+ pt_version = "not installed"
+ pt_cuda_available = "NA"
+ if is_torch_available():
+ import torch
+
+ pt_version = torch.__version__
+ pt_cuda_available = torch.cuda.is_available()
+
+ transformers_version = "not installed"
+        if is_transformers_available():
+ import transformers
+
+ transformers_version = transformers.__version__
+
+ info = {
+ "`diffusers` version": version,
+ "Platform": platform.platform(),
+ "Python version": platform.python_version(),
+ "PyTorch version (GPU?)": f"{pt_version} ({pt_cuda_available})",
+ "Huggingface_hub version": hub_version,
+ "Transformers version": transformers_version,
+ "Using GPU in script?": "",
+ "Using distributed or parallel set-up in script?": "",
+ }
+
+ print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n")
+ print(self.format_dict(info))
+
+ return info
+
+ @staticmethod
+ def format_dict(d):
+ return "\n".join([f"- {prop}: {val}" for prop, val in d.items()]) + "\n"
diff --git a/src/model/TextGen/diffusers/configuration_utils.py b/src/model/TextGen/diffusers/configuration_utils.py
new file mode 100644
index 0000000..fc6ac9b
--- /dev/null
+++ b/src/model/TextGen/diffusers/configuration_utils.py
@@ -0,0 +1,520 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" ConfigMixin base class and utilities."""
+import dataclasses
+import functools
+import importlib
+import inspect
+import json
+import os
+import re
+from collections import OrderedDict
+from typing import Any, Dict, Tuple, Union
+
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError
+from requests import HTTPError
+
+from . import __version__
+from .utils import DIFFUSERS_CACHE, HUGGINGFACE_CO_RESOLVE_ENDPOINT, logging
+
+
+logger = logging.get_logger(__name__)
+
+_re_configuration_file = re.compile(r"config\.(.*)\.json")
+
+
+class ConfigMixin:
+ r"""
+ Base class for all configuration classes. Stores all configuration parameters under `self.config` Also handles all
+ methods for loading/downloading/saving classes inheriting from [`ConfigMixin`] with
+ - [`~ConfigMixin.from_config`]
+ - [`~ConfigMixin.save_config`]
+
+ Class attributes:
+        - **config_name** (`str`) -- A filename under which the config should be stored when calling
+ [`~ConfigMixin.save_config`] (should be overridden by parent class).
+ - **ignore_for_config** (`List[str]`) -- A list of attributes that should not be saved in the config (should be
+ overridden by parent class).
+ - **_compatible_classes** (`List[str]`) -- A list of classes that are compatible with the parent class, so that
+ `from_config` can be used from a class different than the one used to save the config (should be overridden
+ by parent class).
+ """
+ config_name = None
+ ignore_for_config = []
+ _compatible_classes = []
+
+ def register_to_config(self, **kwargs):
+ if self.config_name is None:
+ raise NotImplementedError(f"Make sure that {self.__class__} has defined a class name `config_name`")
+ kwargs["_class_name"] = self.__class__.__name__
+ kwargs["_diffusers_version"] = __version__
+
+ # Special case for `kwargs` used in deprecation warning added to schedulers
+ # TODO: remove this when we remove the deprecation warning, and the `kwargs` argument,
+ # or solve in a more general way.
+ kwargs.pop("kwargs", None)
+ for key, value in kwargs.items():
+ try:
+ setattr(self, key, value)
+ except AttributeError as err:
+ logger.error(f"Can't set {key} with value {value} for {self}")
+ raise err
+
+ if not hasattr(self, "_internal_dict"):
+ internal_dict = kwargs
+ else:
+ previous_dict = dict(self._internal_dict)
+ internal_dict = {**self._internal_dict, **kwargs}
+ logger.debug(f"Updating config from {previous_dict} to {internal_dict}")
+
+ self._internal_dict = FrozenDict(internal_dict)
+
+ def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
+ """
+ Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the
+ [`~ConfigMixin.from_config`] class method.
+
+ Args:
+ save_directory (`str` or `os.PathLike`):
+ Directory where the configuration JSON file will be saved (will be created if it does not exist).
+ """
+ if os.path.isfile(save_directory):
+ raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
+
+ os.makedirs(save_directory, exist_ok=True)
+
+ # If we save using the predefined names, we can load using `from_config`
+ output_config_file = os.path.join(save_directory, self.config_name)
+
+ self.to_json_file(output_config_file)
+ logger.info(f"Configuration saved in {output_config_file}")
+
+ @classmethod
+ def from_config(cls, pretrained_model_name_or_path: Union[str, os.PathLike], return_unused_kwargs=False, **kwargs):
+ r"""
+ Instantiate a Python class from a pre-defined JSON-file.
+
+ Parameters:
+ pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+ Can be either:
+
+ - A string, the *model id* of a model repo on huggingface.co. Valid model ids should have an
+ organization name, like `google/ddpm-celebahq-256`.
+ - A path to a *directory* containing model weights saved using [`~ConfigMixin.save_config`], e.g.,
+ `./my_model_directory/`.
+
+ cache_dir (`Union[str, os.PathLike]`, *optional*):
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the
+ standard cache should not be used.
+ ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
+ Whether or not to raise an error if some of the weights from the checkpoint do not have the same size
+ as the weights of the model (if for instance, you are instantiating a model with 10 labels from a
+ checkpoint with 3 labels).
+ force_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+ cached versions if they exist.
+ resume_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to delete incompletely received files. Will attempt to resume the download if such a
+ file exists.
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ output_loading_info(`bool`, *optional*, defaults to `False`):
+ Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+ local_files_only(`bool`, *optional*, defaults to `False`):
+ Whether or not to only look at local files (i.e., do not try to download the model).
+ use_auth_token (`str` or *bool*, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+ when running `transformers-cli login` (stored in `~/.huggingface`).
+ revision (`str`, *optional*, defaults to `"main"`):
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+ identifier allowed by git.
+ subfolder (`str`, *optional*, defaults to `""`):
+ In case the relevant files are located inside a subfolder of the model repo (either remote in
+ huggingface.co or downloaded locally), you can specify the folder name here.
+
+        It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
+        models](https://huggingface.co/docs/hub/models-gated#gated-models).
+
+        Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
+        use this method in a firewalled environment.
+
+ """
+ config_dict = cls.get_config_dict(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs)
+ init_dict, unused_kwargs = cls.extract_init_dict(config_dict, **kwargs)
+
+ # Allow dtype to be specified on initialization
+ if "dtype" in unused_kwargs:
+ init_dict["dtype"] = unused_kwargs.pop("dtype")
+
+ # Return model and optionally state and/or unused_kwargs
+ model = cls(**init_dict)
+ return_tuple = (model,)
+
+ # Flax schedulers have a state, so return it.
+ if cls.__name__.startswith("Flax") and hasattr(model, "create_state") and getattr(model, "has_state", False):
+ state = model.create_state()
+ return_tuple += (state,)
+
+ if return_unused_kwargs:
+ return return_tuple + (unused_kwargs,)
+ else:
+ return return_tuple if len(return_tuple) > 1 else model
+
+ @classmethod
+ def get_config_dict(
+ cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
+ ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+ cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
+ force_download = kwargs.pop("force_download", False)
+ resume_download = kwargs.pop("resume_download", False)
+ proxies = kwargs.pop("proxies", None)
+ use_auth_token = kwargs.pop("use_auth_token", None)
+ local_files_only = kwargs.pop("local_files_only", False)
+ revision = kwargs.pop("revision", None)
+ _ = kwargs.pop("mirror", None)
+ subfolder = kwargs.pop("subfolder", None)
+
+ user_agent = {"file_type": "config"}
+
+ pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+
+ if cls.config_name is None:
+ raise ValueError(
+ "`self.config_name` is not defined. Note that one should not load a config from "
+ "`ConfigMixin`. Please make sure to define `config_name` in a class inheriting from `ConfigMixin`"
+ )
+
+ if os.path.isfile(pretrained_model_name_or_path):
+ config_file = pretrained_model_name_or_path
+ elif os.path.isdir(pretrained_model_name_or_path):
+ if os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)):
+ # Load from a PyTorch checkpoint
+ config_file = os.path.join(pretrained_model_name_or_path, cls.config_name)
+ elif subfolder is not None and os.path.isfile(
+ os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name)
+ ):
+ config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name)
+ else:
+ raise EnvironmentError(
+ f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}."
+ )
+ else:
+ try:
+ # Load from URL or cache if already cached
+ config_file = hf_hub_download(
+ pretrained_model_name_or_path,
+ filename=cls.config_name,
+ cache_dir=cache_dir,
+ force_download=force_download,
+ proxies=proxies,
+ resume_download=resume_download,
+ local_files_only=local_files_only,
+ use_auth_token=use_auth_token,
+ user_agent=user_agent,
+ subfolder=subfolder,
+ revision=revision,
+ )
+
+ except RepositoryNotFoundError:
+ raise EnvironmentError(
+ f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier"
+ " listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a"
+ " token having permission to this repo with `use_auth_token` or log in with `huggingface-cli"
+ " login`."
+ )
+ except RevisionNotFoundError:
+ raise EnvironmentError(
+ f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for"
+ " this model name. Check the model page at"
+ f" 'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
+ )
+ except EntryNotFoundError:
+ raise EnvironmentError(
+ f"{pretrained_model_name_or_path} does not appear to have a file named {cls.config_name}."
+ )
+ except HTTPError as err:
+ raise EnvironmentError(
+ "There was a specific connection error when trying to load"
+ f" {pretrained_model_name_or_path}:\n{err}"
+ )
+ except ValueError:
+ raise EnvironmentError(
+ f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
+ f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
+ f" directory containing a {cls.config_name} file.\nCheckout your internet connection or see how to"
+ " run the library in offline mode at"
+ " 'https://huggingface.co/docs/diffusers/installation#offline-mode'."
+ )
+ except EnvironmentError:
+ raise EnvironmentError(
+ f"Can't load config for '{pretrained_model_name_or_path}'. If you were trying to load it from "
+ "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
+ f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
+ f"containing a {cls.config_name} file"
+ )
+
+ try:
+ # Load config dict
+ config_dict = cls._dict_from_json_file(config_file)
+ except (json.JSONDecodeError, UnicodeDecodeError):
+ raise EnvironmentError(f"It looks like the config file at '{config_file}' is not a valid JSON file.")
+
+ return config_dict
+
+ @staticmethod
+ def _get_init_keys(cls):
+ return set(dict(inspect.signature(cls.__init__).parameters).keys())
+
+ @classmethod
+ def extract_init_dict(cls, config_dict, **kwargs):
+ # 1. Retrieve expected config attributes from __init__ signature
+ expected_keys = cls._get_init_keys(cls)
+ expected_keys.remove("self")
+ # remove general kwargs if present in dict
+ if "kwargs" in expected_keys:
+ expected_keys.remove("kwargs")
+ # remove flax internal keys
+ if hasattr(cls, "_flax_internal_args"):
+ for arg in cls._flax_internal_args:
+ expected_keys.remove(arg)
+
+ # 2. Remove attributes that cannot be expected from expected config attributes
+ # remove keys to be ignored
+ if len(cls.ignore_for_config) > 0:
+ expected_keys = expected_keys - set(cls.ignore_for_config)
+
+ # load diffusers library to import compatible and original scheduler
+ diffusers_library = importlib.import_module(__name__.split(".")[0])
+
+ # remove attributes from compatible classes that orig cannot expect
+ compatible_classes = [getattr(diffusers_library, c, None) for c in cls._compatible_classes]
+ # filter out None potentially undefined dummy classes
+ compatible_classes = [c for c in compatible_classes if c is not None]
+ expected_keys_comp_cls = set()
+ for c in compatible_classes:
+ expected_keys_c = cls._get_init_keys(c)
+ expected_keys_comp_cls = expected_keys_comp_cls.union(expected_keys_c)
+ expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys(cls)
+ config_dict = {k: v for k, v in config_dict.items() if k not in expected_keys_comp_cls}
+
+ # remove attributes from orig class that cannot be expected
+ orig_cls_name = config_dict.pop("_class_name", cls.__name__)
+ if orig_cls_name != cls.__name__ and hasattr(diffusers_library, orig_cls_name):
+ orig_cls = getattr(diffusers_library, orig_cls_name)
+ unexpected_keys_from_orig = cls._get_init_keys(orig_cls) - expected_keys
+ config_dict = {k: v for k, v in config_dict.items() if k not in unexpected_keys_from_orig}
+
+ # remove private attributes
+ config_dict = {k: v for k, v in config_dict.items() if not k.startswith("_")}
+
+ # 3. Create keyword arguments that will be passed to __init__ from expected keyword arguments
+ init_dict = {}
+ for key in expected_keys:
+ # if config param is passed to kwarg and is present in config dict
+ # it should overwrite existing config dict key
+ if key in kwargs and key in config_dict:
+ config_dict[key] = kwargs.pop(key)
+
+ if key in kwargs:
+ # overwrite key
+ init_dict[key] = kwargs.pop(key)
+ elif key in config_dict:
+ # use value from config dict
+ init_dict[key] = config_dict.pop(key)
+
+ # 4. Give nice warning if unexpected values have been passed
+ if len(config_dict) > 0:
+ logger.warning(
+ f"The config attributes {config_dict} were passed to {cls.__name__}, "
+ "but are not expected and will be ignored. Please verify your "
+ f"{cls.config_name} configuration file."
+ )
+
+        # 5. Give nice info if config attributes are initialized to default because they have not been passed
+ passed_keys = set(init_dict.keys())
+ if len(expected_keys - passed_keys) > 0:
+ logger.info(
+ f"{expected_keys - passed_keys} was not found in config. Values will be initialized to default values."
+ )
+
+ # 6. Define unused keyword arguments
+ unused_kwargs = {**config_dict, **kwargs}
+
+ return init_dict, unused_kwargs
+
+ @classmethod
+ def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
+ with open(json_file, "r", encoding="utf-8") as reader:
+ text = reader.read()
+ return json.loads(text)
+
+ def __repr__(self):
+ return f"{self.__class__.__name__} {self.to_json_string()}"
+
+ @property
+ def config(self) -> Dict[str, Any]:
+ return self._internal_dict
+
+ def to_json_string(self) -> str:
+ """
+ Serializes this instance to a JSON string.
+
+ Returns:
+ `str`: String containing all the attributes that make up this configuration instance in JSON format.
+ """
+ config_dict = self._internal_dict if hasattr(self, "_internal_dict") else {}
+ return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
+
+ def to_json_file(self, json_file_path: Union[str, os.PathLike]):
+ """
+ Save this instance to a JSON file.
+
+ Args:
+ json_file_path (`str` or `os.PathLike`):
+ Path to the JSON file in which this configuration instance's parameters will be saved.
+ """
+ with open(json_file_path, "w", encoding="utf-8") as writer:
+ writer.write(self.to_json_string())
+
+
+class FrozenDict(OrderedDict):
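+    """An OrderedDict whose items are also exposed as attributes and which rejects all mutation once constructed."""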
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ for key, value in self.items():
+ setattr(self, key, value)
+
+ self.__frozen = True
+
+ def __delitem__(self, *args, **kwargs):
+ raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
+
+ def setdefault(self, *args, **kwargs):
+ raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
+
+ def pop(self, *args, **kwargs):
+ raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
+
+ def update(self, *args, **kwargs):
+ raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
+
+ def __setattr__(self, name, value):
+ if hasattr(self, "__frozen") and self.__frozen:
+ raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
+ super().__setattr__(name, value)
+
+ def __setitem__(self, name, value):
+ if hasattr(self, "__frozen") and self.__frozen:
+ raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
+ super().__setitem__(name, value)
+
+
+def register_to_config(init):
+ r"""
+ Decorator to apply on the init of classes inheriting from [`ConfigMixin`] so that all the arguments are
+    automatically sent to `self.register_to_config`. To ignore a specific argument accepted by the init but that
+ shouldn't be registered in the config, use the `ignore_for_config` class variable
+
+ Warning: Once decorated, all private arguments (beginning with an underscore) are trashed and not sent to the init!
+ """
+
+ @functools.wraps(init)
+ def inner_init(self, *args, **kwargs):
+ # Ignore private kwargs in the init.
+ init_kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")}
+ init(self, *args, **init_kwargs)
+ if not isinstance(self, ConfigMixin):
+ raise RuntimeError(
+ f"`@register_for_config` was applied to {self.__class__.__name__} init method, but this class does "
+ "not inherit from `ConfigMixin`."
+ )
+
+ ignore = getattr(self, "ignore_for_config", [])
+ # Get positional arguments aligned with kwargs
+ new_kwargs = {}
+ signature = inspect.signature(init)
+ parameters = {
+ name: p.default for i, (name, p) in enumerate(signature.parameters.items()) if i > 0 and name not in ignore
+ }
+ for arg, name in zip(args, parameters.keys()):
+ new_kwargs[name] = arg
+
+ # Then add all kwargs
+ new_kwargs.update(
+ {
+ k: init_kwargs.get(k, default)
+ for k, default in parameters.items()
+ if k not in ignore and k not in new_kwargs
+ }
+ )
+ getattr(self, "register_to_config")(**new_kwargs)
+
+ return inner_init
+
+
+def flax_register_to_config(cls):
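+    """Class decorator: wraps a Flax dataclass-style __init__ so its arguments and field defaults are recorded via register_to_config."""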
+ original_init = cls.__init__
+
+ @functools.wraps(original_init)
+ def init(self, *args, **kwargs):
+ if not isinstance(self, ConfigMixin):
+ raise RuntimeError(
+ f"`@register_for_config` was applied to {self.__class__.__name__} init method, but this class does "
+ "not inherit from `ConfigMixin`."
+ )
+
+ # Ignore private kwargs in the init. Retrieve all passed attributes
+ init_kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")}
+
+ # Retrieve default values
+ fields = dataclasses.fields(self)
+ default_kwargs = {}
+ for field in fields:
+ # ignore flax specific attributes
+ if field.name in self._flax_internal_args:
+ continue
+ if type(field.default) == dataclasses._MISSING_TYPE:
+ default_kwargs[field.name] = None
+ else:
+ default_kwargs[field.name] = getattr(self, field.name)
+
+ # Make sure init_kwargs override default kwargs
+ new_kwargs = {**default_kwargs, **init_kwargs}
+ # dtype should be part of `init_kwargs`, but not `new_kwargs`
+ if "dtype" in new_kwargs:
+ new_kwargs.pop("dtype")
+
+ # Get positional arguments aligned with kwargs
+ for i, arg in enumerate(args):
+ name = fields[i].name
+ new_kwargs[name] = arg
+
+ getattr(self, "register_to_config")(**new_kwargs)
+ original_init(self, *args, **kwargs)
+
+ cls.__init__ = init
+ return cls
diff --git a/src/model/TextGen/diffusers/dependency_versions_check.py b/src/model/TextGen/diffusers/dependency_versions_check.py
new file mode 100644
index 0000000..bbf8632
--- /dev/null
+++ b/src/model/TextGen/diffusers/dependency_versions_check.py
@@ -0,0 +1,47 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+
+from .dependency_versions_table import deps
+from .utils.versions import require_version, require_version_core
+
+
+# define which module versions we always want to check at run time
+# (usually the ones defined in `install_requires` in setup.py)
+#
+# order specific notes:
+# - tqdm must be checked before tokenizers
+
+pkgs_to_check_at_runtime = "python tqdm regex requests packaging filelock numpy tokenizers".split()
+if sys.version_info < (3, 7):
+ pkgs_to_check_at_runtime.append("dataclasses")
+if sys.version_info < (3, 8):
+ pkgs_to_check_at_runtime.append("importlib_metadata")
+
+for pkg in pkgs_to_check_at_runtime:
+ if pkg in deps:
+ if pkg == "tokenizers":
+ # must be loaded here, or else tqdm check may fail
+ from .utils import is_tokenizers_available
+
+ if not is_tokenizers_available():
+ continue # not required, check version only if installed
+
+ require_version_core(deps[pkg])
+ else:
+ raise ValueError(f"can't find {pkg} in {deps.keys()}, check dependency_versions_table.py")
+
+
+def dep_version_check(pkg, hint=None):
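+    """Check that the installed version of `pkg` satisfies the pinned requirement in the deps table."""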
+ require_version(deps[pkg], hint)
diff --git a/src/model/TextGen/diffusers/dependency_versions_table.py b/src/model/TextGen/diffusers/dependency_versions_table.py
new file mode 100644
index 0000000..59e13da
--- /dev/null
+++ b/src/model/TextGen/diffusers/dependency_versions_table.py
@@ -0,0 +1,31 @@
+# THIS FILE HAS BEEN AUTOGENERATED. To update:
+# 1. modify the `_deps` dict in setup.py
+# 2. run `make deps_table_update`
+deps = {
+ "Pillow": "Pillow<10.0",
+ "accelerate": "accelerate>=0.11.0",
+ "black": "black==22.8",
+ "datasets": "datasets",
+ "filelock": "filelock",
+ "flake8": "flake8>=3.8.3",
+ "flax": "flax>=0.4.1",
+ "hf-doc-builder": "hf-doc-builder>=0.3.0",
+ "huggingface-hub": "huggingface-hub>=0.10.0",
+ "importlib_metadata": "importlib_metadata",
+ "isort": "isort>=5.5.4",
+ "jax": "jax>=0.2.8,!=0.3.2",
+ "jaxlib": "jaxlib>=0.1.65",
+ "modelcards": "modelcards>=0.1.4",
+ "numpy": "numpy",
+ "parameterized": "parameterized",
+ "pytest": "pytest",
+ "pytest-timeout": "pytest-timeout",
+ "pytest-xdist": "pytest-xdist",
+ "scipy": "scipy",
+ "regex": "regex!=2019.12.17",
+ "requests": "requests",
+ "tensorboard": "tensorboard",
+ "torch": "torch>=1.4",
+ "torchvision": "torchvision",
+ "transformers": "transformers>=4.21.0",
+}
diff --git a/src/model/TextGen/diffusers/dynamic_modules_utils.py b/src/model/TextGen/diffusers/dynamic_modules_utils.py
new file mode 100644
index 0000000..31f3bed
--- /dev/null
+++ b/src/model/TextGen/diffusers/dynamic_modules_utils.py
@@ -0,0 +1,428 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utilities to dynamically load objects from the Hub."""
+
+import importlib
+import inspect
+import os
+import re
+import shutil
+import sys
+from pathlib import Path
+from typing import Dict, Optional, Union
+
+from huggingface_hub import HfFolder, cached_download, hf_hub_download, model_info
+
+from .utils import DIFFUSERS_DYNAMIC_MODULE_NAME, HF_MODULES_CACHE, logging
+
+
+COMMUNITY_PIPELINES_URL = (
+ "https://raw.githubusercontent.com/huggingface/diffusers/main/examples/community/{pipeline}.py"
+)
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+def init_hf_modules():
+ """
+ Creates the cache directory for modules with an init, and adds it to the Python path.
+ """
+ # This function has already been executed if HF_MODULES_CACHE already is in the Python path.
+ if HF_MODULES_CACHE in sys.path:
+ return
+
+ sys.path.append(HF_MODULES_CACHE)
+ os.makedirs(HF_MODULES_CACHE, exist_ok=True)
+ init_path = Path(HF_MODULES_CACHE) / "__init__.py"
+ if not init_path.exists():
+ init_path.touch()
+
+
+def create_dynamic_module(name: Union[str, os.PathLike]):
+ """
+ Creates a dynamic module in the cache directory for modules.
+ """
+ init_hf_modules()
+ dynamic_module_path = Path(HF_MODULES_CACHE) / name
+ # If the parent module does not exist yet, recursively create it.
+ if not dynamic_module_path.parent.exists():
+ create_dynamic_module(dynamic_module_path.parent)
+ os.makedirs(dynamic_module_path, exist_ok=True)
+ init_path = dynamic_module_path / "__init__.py"
+ if not init_path.exists():
+ init_path.touch()
+
+
+def get_relative_imports(module_file):
+ """
+ Get the list of modules that are relatively imported in a module file.
+
+ Args:
+ module_file (`str` or `os.PathLike`): The module file to inspect.
+ """
+ with open(module_file, "r", encoding="utf-8") as f:
+ content = f.read()
+
+ # Imports of the form `import .xxx`
+    relative_imports = re.findall(r"^\s*import\s+\.(\S+)\s*$", content, flags=re.MULTILINE)
+    # Imports of the form `from .xxx import yyy`
+    relative_imports += re.findall(r"^\s*from\s+\.(\S+)\s+import", content, flags=re.MULTILINE)
+ # Unique-ify
+ return list(set(relative_imports))
+
+
+def get_relative_import_files(module_file):
+ """
+ Get the list of all files that are needed for a given module. Note that this function recurses through the relative
+ imports (if a imports b and b imports c, it will return module files for b and c).
+
+ Args:
+ module_file (`str` or `os.PathLike`): The module file to inspect.
+ """
+ no_change = False
+ files_to_check = [module_file]
+ all_relative_imports = []
+
+ # Let's recurse through all relative imports
+ while not no_change:
+ new_imports = []
+ for f in files_to_check:
+ new_imports.extend(get_relative_imports(f))
+
+ module_path = Path(module_file).parent
+ new_import_files = [str(module_path / m) for m in new_imports]
+ new_import_files = [f for f in new_import_files if f not in all_relative_imports]
+ files_to_check = [f"{f}.py" for f in new_import_files]
+
+ no_change = len(new_import_files) == 0
+ all_relative_imports.extend(files_to_check)
+
+ return all_relative_imports
+
+
+def check_imports(filename):
+ """
+ Check if the current Python environment contains all the libraries that are imported in a file.
+ """
+ with open(filename, "r", encoding="utf-8") as f:
+ content = f.read()
+
+ # Imports of the form `import xxx`
+    imports = re.findall(r"^\s*import\s+(\S+)\s*$", content, flags=re.MULTILINE)
+    # Imports of the form `from xxx import yyy`
+    imports += re.findall(r"^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE)
+ # Only keep the top-level module
+ imports = [imp.split(".")[0] for imp in imports if not imp.startswith(".")]
+
+ # Unique-ify and test we got them all
+ imports = list(set(imports))
+ missing_packages = []
+ for imp in imports:
+ try:
+ importlib.import_module(imp)
+ except ImportError:
+ missing_packages.append(imp)
+
+ if len(missing_packages) > 0:
+ raise ImportError(
+ "This modeling file requires the following packages that were not found in your environment: "
+ f"{', '.join(missing_packages)}. Run `pip install {' '.join(missing_packages)}`"
+ )
+
+ return get_relative_imports(filename)
+
+
+def get_class_in_module(class_name, module_path):
+ """
+ Import a module on the cache directory for modules and extract a class from it.
+ """
+ module_path = module_path.replace(os.path.sep, ".")
+ module = importlib.import_module(module_path)
+
+ if class_name is None:
+ return find_pipeline_class(module)
+ return getattr(module, class_name)
+
+
+def find_pipeline_class(loaded_module):
+ """
+ Retrieve pipeline class that inherits from `DiffusionPipeline`. Note that there has to be exactly one class
+ inheriting from `DiffusionPipeline`.
+ """
+ from .pipeline_utils import DiffusionPipeline
+
+ cls_members = dict(inspect.getmembers(loaded_module, inspect.isclass))
+
+ pipeline_class = None
+ for cls_name, cls in cls_members.items():
+ if (
+ cls_name != DiffusionPipeline.__name__
+ and issubclass(cls, DiffusionPipeline)
+ and cls.__module__.split(".")[0] != "diffusers"
+ ):
+ if pipeline_class is not None:
+ raise ValueError(
+ f"Multiple classes that inherit from {DiffusionPipeline.__name__} have been found:"
+ f" {pipeline_class.__name__}, and {cls_name}. Please make sure to define only one in"
+ f" {loaded_module}."
+ )
+ pipeline_class = cls
+
+ return pipeline_class
+
+
+def get_cached_module_file(
+ pretrained_model_name_or_path: Union[str, os.PathLike],
+ module_file: str,
+ cache_dir: Optional[Union[str, os.PathLike]] = None,
+ force_download: bool = False,
+ resume_download: bool = False,
+ proxies: Optional[Dict[str, str]] = None,
+ use_auth_token: Optional[Union[bool, str]] = None,
+ revision: Optional[str] = None,
+ local_files_only: bool = False,
+):
+ """
+    Downloads a module from a local folder or a remote repo and returns its path inside the cached
+ Transformers module.
+
+ Args:
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
+ This can be either:
+
+ - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
+ huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
+ under a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - a path to a *directory* containing a configuration file saved using the
+ [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
+
+ module_file (`str`):
+ The name of the module file containing the class to look for.
+ cache_dir (`str` or `os.PathLike`, *optional*):
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
+ cache should not be used.
+ force_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to force to (re-)download the configuration files and override the cached versions if they
+ exist.
+ resume_download (`bool`, *optional*, defaults to `False`):
+            Whether or not to delete incompletely received files. Attempts to resume the download if such a file exists.
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+ 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
+ use_auth_token (`str` or *bool*, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+ when running `transformers-cli login` (stored in `~/.huggingface`).
+ revision (`str`, *optional*, defaults to `"main"`):
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+ identifier allowed by git.
+ local_files_only (`bool`, *optional*, defaults to `False`):
+ If `True`, will only try to load the tokenizer configuration from local files.
+
+    You may pass a token in `use_auth_token` if you are not logged in (`huggingface-cli login`) and want to use private
+    or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models).
+
+ Returns:
+ `str`: The path to the module inside the cache.
+ """
+    # Download and cache module_file from the repo `pretrained_model_name_or_path`, or grab it if it's a local file.
+ pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+
+ module_file_or_url = os.path.join(pretrained_model_name_or_path, module_file)
+
+ if os.path.isfile(module_file_or_url):
+ resolved_module_file = module_file_or_url
+ submodule = "local"
+ elif pretrained_model_name_or_path.count("/") == 0:
+ # community pipeline on GitHub
+ github_url = COMMUNITY_PIPELINES_URL.format(pipeline=pretrained_model_name_or_path)
+ try:
+ resolved_module_file = cached_download(
+ github_url,
+ cache_dir=cache_dir,
+ force_download=force_download,
+ proxies=proxies,
+ resume_download=resume_download,
+ local_files_only=local_files_only,
+ use_auth_token=False,
+ )
+ submodule = "git"
+ module_file = pretrained_model_name_or_path + ".py"
+ except EnvironmentError:
+ logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.")
+ raise
+ else:
+ try:
+ # Load from URL or cache if already cached
+ resolved_module_file = hf_hub_download(
+ pretrained_model_name_or_path,
+ module_file,
+ cache_dir=cache_dir,
+ force_download=force_download,
+ proxies=proxies,
+ resume_download=resume_download,
+ local_files_only=local_files_only,
+ use_auth_token=use_auth_token,
+ )
+ submodule = os.path.join("local", "--".join(pretrained_model_name_or_path.split("/")))
+ except EnvironmentError:
+ logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.")
+ raise
+
+ # Check we have all the requirements in our environment
+ modules_needed = check_imports(resolved_module_file)
+
+ # Now we move the module inside our cached dynamic modules.
+ full_submodule = DIFFUSERS_DYNAMIC_MODULE_NAME + os.path.sep + submodule
+ create_dynamic_module(full_submodule)
+ submodule_path = Path(HF_MODULES_CACHE) / full_submodule
+ if submodule == "local" or submodule == "git":
+ # We always copy local files (we could hash the file to see if there was a change, and give them the name of
+ # that hash, to only copy when there is a modification but it seems overkill for now).
+ # The only reason we do the copy is to avoid putting too many folders in sys.path.
+ shutil.copy(resolved_module_file, submodule_path / module_file)
+ for module_needed in modules_needed:
+ module_needed = f"{module_needed}.py"
+ shutil.copy(os.path.join(pretrained_model_name_or_path, module_needed), submodule_path / module_needed)
+ else:
+ # Get the commit hash
+ # TODO: we will get this info in the etag soon, so retrieve it from there and not here.
+ if isinstance(use_auth_token, str):
+ token = use_auth_token
+ elif use_auth_token is True:
+ token = HfFolder.get_token()
+ else:
+ token = None
+
+ commit_hash = model_info(pretrained_model_name_or_path, revision=revision, token=token).sha
+
+ # The module file will end up being placed in a subfolder with the git hash of the repo. This way we get the
+ # benefit of versioning.
+ submodule_path = submodule_path / commit_hash
+ full_submodule = full_submodule + os.path.sep + commit_hash
+ create_dynamic_module(full_submodule)
+
+ if not (submodule_path / module_file).exists():
+ shutil.copy(resolved_module_file, submodule_path / module_file)
+ # Make sure we also have every file with relative
+ for module_needed in modules_needed:
+ if not (submodule_path / module_needed).exists():
+ get_cached_module_file(
+ pretrained_model_name_or_path,
+ f"{module_needed}.py",
+ cache_dir=cache_dir,
+ force_download=force_download,
+ resume_download=resume_download,
+ proxies=proxies,
+ use_auth_token=use_auth_token,
+ revision=revision,
+ local_files_only=local_files_only,
+ )
+ return os.path.join(full_submodule, module_file)
+
+
+def get_class_from_dynamic_module(
+ pretrained_model_name_or_path: Union[str, os.PathLike],
+ module_file: str,
+ class_name: Optional[str] = None,
+ cache_dir: Optional[Union[str, os.PathLike]] = None,
+ force_download: bool = False,
+ resume_download: bool = False,
+ proxies: Optional[Dict[str, str]] = None,
+ use_auth_token: Optional[Union[bool, str]] = None,
+ revision: Optional[str] = None,
+ local_files_only: bool = False,
+ **kwargs,
+):
+ """
+ Extracts a class from a module file, present in the local folder or repository of a model.
+
+ Calling this function will execute the code in the module file found locally or downloaded from the Hub. It should
+ therefore only be called on trusted repos.
+
+ Args:
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
+ This can be either:
+
+ - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
+ huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
+ under a user or organization name, like `dbmdz/bert-base-german-cased`.
+ - a path to a *directory* containing a configuration file saved using the
+ [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
+
+ module_file (`str`):
+ The name of the module file containing the class to look for.
+ class_name (`str`):
+ The name of the class to import in the module.
+ cache_dir (`str` or `os.PathLike`, *optional*):
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
+ cache should not be used.
+ force_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to force to (re-)download the configuration files and override the cached versions if they
+ exist.
+ resume_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ use_auth_token (`str` or `bool`, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+ when running `transformers-cli login` (stored in `~/.huggingface`).
+ revision (`str`, *optional*, defaults to `"main"`):
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+ identifier allowed by git.
+ local_files_only (`bool`, *optional*, defaults to `False`):
+ If `True`, will only try to load the tokenizer configuration from local files.
+
+ You may pass a token in `use_auth_token` if you are not logged in (`huggingface-cli login`) and want to use private
+ or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models).
+
+ Returns:
+ `type`: The class, dynamically imported from the module.
+
+ Examples:
+
+ ```python
+ # Download module `modeling.py` from huggingface.co and cache then extract the class `MyBertModel` from this
+ # module.
+ cls = get_class_from_dynamic_module("sgugger/my-bert-model", "modeling.py", "MyBertModel")
+ ```"""
+ # And lastly we get the class inside our newly created module
+ final_module = get_cached_module_file(
+ pretrained_model_name_or_path,
+ module_file,
+ cache_dir=cache_dir,
+ force_download=force_download,
+ resume_download=resume_download,
+ proxies=proxies,
+ use_auth_token=use_auth_token,
+ revision=revision,
+ local_files_only=local_files_only,
+ )
+ return get_class_in_module(class_name, final_module.replace(".py", ""))
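+
+
+# Illustrative usage sketch (not part of the upstream module; the repo id, file and class
+# names below are hypothetical):
+#
+#     CustomPipeline = get_class_from_dynamic_module(
+#         "some-user/custom-pipeline", "pipeline.py", class_name="CustomPipeline"
+#     )
+#
+# Because the resolved module is executed on import, this should only be done for trusted
+# repositories.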
diff --git a/src/model/TextGen/diffusers/hub_utils.py b/src/model/TextGen/diffusers/hub_utils.py
new file mode 100644
index 0000000..8bf0933
--- /dev/null
+++ b/src/model/TextGen/diffusers/hub_utils.py
@@ -0,0 +1,246 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import shutil
+import sys
+from pathlib import Path
+from typing import Dict, Optional, Union
+from uuid import uuid4
+
+from huggingface_hub import HfFolder, Repository, whoami
+
+from . import __version__
+from .utils import ENV_VARS_TRUE_VALUES, deprecate, logging
+from .utils.import_utils import (
+ _flax_version,
+ _jax_version,
+ _onnxruntime_version,
+ _torch_version,
+ is_flax_available,
+ is_modelcards_available,
+ is_onnx_available,
+ is_torch_available,
+)
+
+
+if is_modelcards_available():
+ from modelcards import CardData, ModelCard
+
+
+logger = logging.get_logger(__name__)
+
+
+MODEL_CARD_TEMPLATE_PATH = Path(__file__).parent / "utils" / "model_card_template.md"
+SESSION_ID = uuid4().hex
+DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", "").upper() in ENV_VARS_TRUE_VALUES
+
+
+def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
+ """
+ Formats a user-agent string with basic info about a request.
+ """
+ ua = f"diffusers/{__version__}; python/{sys.version.split()[0]}; session_id/{SESSION_ID}"
+ if DISABLE_TELEMETRY:
+ return ua + "; telemetry/off"
+ if is_torch_available():
+ ua += f"; torch/{_torch_version}"
+ if is_flax_available():
+ ua += f"; jax/{_jax_version}"
+ ua += f"; flax/{_flax_version}"
+ if is_onnx_available():
+ ua += f"; onnxruntime/{_onnxruntime_version}"
+ # CI will set this value to True
+ if os.environ.get("DIFFUSERS_IS_CI", "").upper() in ENV_VARS_TRUE_VALUES:
+ ua += "; is_ci/true"
+ if isinstance(user_agent, dict):
+ ua += "; " + "; ".join(f"{k}/{v}" for k, v in user_agent.items())
+ elif isinstance(user_agent, str):
+ ua += "; " + user_agent
+ return ua
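+
+
+# For illustration (version numbers are examples, not pinned by this module), the resulting
+# user-agent looks like:
+#
+#     diffusers/0.7.0; python/3.8.13; session_id/<hex>; torch/1.13.0; file_type/model
+#
+# with dict-valued `user_agent` entries appended as `key/value` pairs.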
+
+
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
+ if token is None:
+ token = HfFolder.get_token()
+ if organization is None:
+ username = whoami(token)["name"]
+ return f"{username}/{model_id}"
+ else:
+ return f"{organization}/{model_id}"
+
+
+def init_git_repo(args, at_init: bool = False):
+ """
+ Args:
+ Initializes a git repo in `args.hub_model_id`.
+ at_init (`bool`, *optional*, defaults to `False`):
+ Whether this function is called before any training or not. If `self.args.overwrite_output_dir` is `True`
+ and `at_init` is `True`, the path to the repo (which is `self.args.output_dir`) might be wiped out.
+ """
+ deprecation_message = (
+ "Please use `huggingface_hub.Repository`. "
+ "See `examples/unconditional_image_generation/train_unconditional.py` for an example."
+ )
+ deprecate("init_git_repo()", "0.10.0", deprecation_message)
+
+ if hasattr(args, "local_rank") and args.local_rank not in [-1, 0]:
+ return
+ hub_token = args.hub_token if hasattr(args, "hub_token") else None
+ use_auth_token = True if hub_token is None else hub_token
+ if not hasattr(args, "hub_model_id") or args.hub_model_id is None:
+ repo_name = Path(args.output_dir).absolute().name
+ else:
+ repo_name = args.hub_model_id
+ if "/" not in repo_name:
+ repo_name = get_full_repo_name(repo_name, token=hub_token)
+
+ try:
+ repo = Repository(
+ args.output_dir,
+ clone_from=repo_name,
+ use_auth_token=use_auth_token,
+ private=args.hub_private_repo,
+ )
+ except EnvironmentError:
+ if args.overwrite_output_dir and at_init:
+ # Try again after wiping output_dir
+ shutil.rmtree(args.output_dir)
+ repo = Repository(
+ args.output_dir,
+ clone_from=repo_name,
+ use_auth_token=use_auth_token,
+ )
+ else:
+ raise
+
+ repo.git_pull()
+
+ # By default, ignore the checkpoint folders
+ if not os.path.exists(os.path.join(args.output_dir, ".gitignore")):
+ with open(os.path.join(args.output_dir, ".gitignore"), "w", encoding="utf-8") as writer:
+ writer.writelines(["checkpoint-*/"])
+
+ return repo
+
+
+def push_to_hub(
+ args,
+ pipeline,
+ repo: Repository,
+ commit_message: Optional[str] = "End of training",
+ blocking: bool = True,
+ **kwargs,
+) -> str:
+ """
+ Parameters:
+ Upload *self.model* and *self.tokenizer* to the 🤗 model hub on the repo *self.args.hub_model_id*.
+ commit_message (`str`, *optional*, defaults to `"End of training"`):
+ Message to commit while pushing.
+ blocking (`bool`, *optional*, defaults to `True`):
+ Whether the function should return only when the `git push` has finished.
+ kwargs:
+ Additional keyword arguments passed along to [`create_model_card`].
+ Returns:
+ The url of the commit of your model in the given repository if `blocking=False`, a tuple with the url of the
+ commit and an object to track the progress of the commit if `blocking=True`
+ """
+ deprecation_message = (
+ "Please use `huggingface_hub.Repository` and `Repository.push_to_hub()`. "
+ "See `examples/unconditional_image_generation/train_unconditional.py` for an example."
+ )
+ deprecate("push_to_hub()", "0.10.0", deprecation_message)
+
+ if not hasattr(args, "hub_model_id") or args.hub_model_id is None:
+ model_name = Path(args.output_dir).name
+ else:
+ model_name = args.hub_model_id.split("/")[-1]
+
+ output_dir = args.output_dir
+ os.makedirs(output_dir, exist_ok=True)
+ logger.info(f"Saving pipeline checkpoint to {output_dir}")
+ pipeline.save_pretrained(output_dir)
+
+ # Only push from one node.
+ if hasattr(args, "local_rank") and args.local_rank not in [-1, 0]:
+ return
+
+ # Cancel any async push in progress if blocking=True. The commits will all be pushed together.
+ if (
+ blocking
+ and len(repo.command_queue) > 0
+ and repo.command_queue[-1] is not None
+ and not repo.command_queue[-1].is_done
+ ):
+ repo.command_queue[-1]._process.kill()
+
+ git_head_commit_url = repo.push_to_hub(commit_message=commit_message, blocking=blocking, auto_lfs_prune=True)
+ # push separately the model card to be independent from the rest of the model
+ create_model_card(args, model_name=model_name)
+ try:
+ repo.push_to_hub(commit_message="update model card README.md", blocking=blocking, auto_lfs_prune=True)
+ except EnvironmentError as exc:
+ logger.error(f"Error pushing update to the model card. Please read logs and retry.\n${exc}")
+
+ return git_head_commit_url
+
+
+def create_model_card(args, model_name):
+ if not is_modelcards_available():
+ raise ValueError(
+ "Please make sure to have `modelcards` installed when using the `create_model_card` function. You can"
+ " install the package with `pip install modelcards`."
+ )
+
+ if hasattr(args, "local_rank") and args.local_rank not in [-1, 0]:
+ return
+
+ hub_token = args.hub_token if hasattr(args, "hub_token") else None
+ repo_name = get_full_repo_name(model_name, token=hub_token)
+
+ model_card = ModelCard.from_template(
+ card_data=CardData( # Card metadata object that will be converted to YAML block
+ language="en",
+ license="apache-2.0",
+ library_name="diffusers",
+ tags=[],
+ datasets=args.dataset_name,
+ metrics=[],
+ ),
+ template_path=MODEL_CARD_TEMPLATE_PATH,
+ model_name=model_name,
+ repo_name=repo_name,
+ dataset_name=args.dataset_name if hasattr(args, "dataset_name") else None,
+ learning_rate=args.learning_rate,
+ train_batch_size=args.train_batch_size,
+ eval_batch_size=args.eval_batch_size,
+ gradient_accumulation_steps=args.gradient_accumulation_steps
+ if hasattr(args, "gradient_accumulation_steps")
+ else None,
+ adam_beta1=args.adam_beta1 if hasattr(args, "adam_beta1") else None,
+ adam_beta2=args.adam_beta2 if hasattr(args, "adam_beta2") else None,
+ adam_weight_decay=args.adam_weight_decay if hasattr(args, "adam_weight_decay") else None,
+ adam_epsilon=args.adam_epsilon if hasattr(args, "adam_epsilon") else None,
+ lr_scheduler=args.lr_scheduler if hasattr(args, "lr_scheduler") else None,
+ lr_warmup_steps=args.lr_warmup_steps if hasattr(args, "lr_warmup_steps") else None,
+ ema_inv_gamma=args.ema_inv_gamma if hasattr(args, "ema_inv_gamma") else None,
+ ema_power=args.ema_power if hasattr(args, "ema_power") else None,
+ ema_max_decay=args.ema_max_decay if hasattr(args, "ema_max_decay") else None,
+ mixed_precision=args.mixed_precision,
+ )
+
+ card_path = os.path.join(args.output_dir, "README.md")
+ model_card.save(card_path)
diff --git a/src/model/TextGen/diffusers/modeling_flax_pytorch_utils.py b/src/model/TextGen/diffusers/modeling_flax_pytorch_utils.py
new file mode 100644
index 0000000..9c7a5de
--- /dev/null
+++ b/src/model/TextGen/diffusers/modeling_flax_pytorch_utils.py
@@ -0,0 +1,117 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch - Flax general utilities."""
+import re
+
+import jax.numpy as jnp
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax.random import PRNGKey
+
+from .utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def rename_key(key):
+ regex = r"\w+[.]\d+"
+ pats = re.findall(regex, key)
+ for pat in pats:
+ key = key.replace(pat, "_".join(pat.split(".")))
+ return key
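+
+
+# Example (illustrative): indexed PyTorch module names are rewritten with underscores so
+# that they match Flax's naming, e.g.
+#
+#     rename_key("down_blocks.0.resnets.1.conv1.weight")
+#     # -> "down_blocks_0.resnets_1.conv1.weight"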
+
+
+#####################
+# PyTorch => Flax #
+#####################
+
+# Adapted from https://github.com/huggingface/transformers/blob/c603c80f46881ae18b2ca50770ef65fa4033eacd/src/transformers/modeling_flax_pytorch_utils.py#L69
+# and https://github.com/patil-suraj/stable-diffusion-jax/blob/main/stable_diffusion_jax/convert_diffusers_to_jax.py
+def rename_key_and_reshape_tensor(pt_tuple_key, pt_tensor, random_flax_state_dict):
+ """Rename PT weight names to corresponding Flax weight names and reshape tensor if necessary"""
+
+ # conv norm or layer norm
+ renamed_pt_tuple_key = pt_tuple_key[:-1] + ("scale",)
+ if (
+ any("norm" in str_ for str_ in pt_tuple_key)
+ and (pt_tuple_key[-1] == "bias")
+ and (pt_tuple_key[:-1] + ("bias",) not in random_flax_state_dict)
+ and (pt_tuple_key[:-1] + ("scale",) in random_flax_state_dict)
+ ):
+ renamed_pt_tuple_key = pt_tuple_key[:-1] + ("scale",)
+ return renamed_pt_tuple_key, pt_tensor
+ elif pt_tuple_key[-1] in ["weight", "gamma"] and pt_tuple_key[:-1] + ("scale",) in random_flax_state_dict:
+ renamed_pt_tuple_key = pt_tuple_key[:-1] + ("scale",)
+ return renamed_pt_tuple_key, pt_tensor
+
+ # embedding
+ renamed_pt_tuple_key = pt_tuple_key[:-1] + ("embedding",)
+ if pt_tuple_key[-1] == "weight" and renamed_pt_tuple_key in random_flax_state_dict:
+ return renamed_pt_tuple_key, pt_tensor
+
+ # conv layer
+ renamed_pt_tuple_key = pt_tuple_key[:-1] + ("kernel",)
+ if pt_tuple_key[-1] == "weight" and pt_tensor.ndim == 4:
+ pt_tensor = pt_tensor.transpose(2, 3, 1, 0)
+ return renamed_pt_tuple_key, pt_tensor
+
+ # linear layer
+ renamed_pt_tuple_key = pt_tuple_key[:-1] + ("kernel",)
+ if pt_tuple_key[-1] == "weight":
+ pt_tensor = pt_tensor.T
+ return renamed_pt_tuple_key, pt_tensor
+
+ # old PyTorch layer norm weight
+ renamed_pt_tuple_key = pt_tuple_key[:-1] + ("weight",)
+ if pt_tuple_key[-1] == "gamma":
+ return renamed_pt_tuple_key, pt_tensor
+
+ # old PyTorch layer norm bias
+ renamed_pt_tuple_key = pt_tuple_key[:-1] + ("bias",)
+ if pt_tuple_key[-1] == "beta":
+ return renamed_pt_tuple_key, pt_tensor
+
+ return pt_tuple_key, pt_tensor
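+
+
+# Example (illustrative): a PyTorch conv weight of shape (out_ch, in_ch, kh, kw) under the
+# key ("conv_in", "weight") is returned as a Flax kernel of shape (kh, kw, in_ch, out_ch)
+# under ("conv_in", "kernel"); a 2D linear weight is simply transposed.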
+
+
+def convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model, init_key=42):
+ # Step 1: Convert pytorch tensor to numpy
+ pt_state_dict = {k: v.numpy() for k, v in pt_state_dict.items()}
+
+ # Step 2: Since the model is stateless, get random Flax params
+ random_flax_params = flax_model.init_weights(PRNGKey(init_key))
+
+ random_flax_state_dict = flatten_dict(random_flax_params)
+ flax_state_dict = {}
+
+ # Rename some parameters to match the corresponding Flax names
+ for pt_key, pt_tensor in pt_state_dict.items():
+ renamed_pt_key = rename_key(pt_key)
+ pt_tuple_key = tuple(renamed_pt_key.split("."))
+
+ # Correctly rename weight parameters
+ flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, pt_tensor, random_flax_state_dict)
+
+ if flax_key in random_flax_state_dict:
+ if flax_tensor.shape != random_flax_state_dict[flax_key].shape:
+ raise ValueError(
+ f"PyTorch checkpoint seems to be incorrect. Weight {pt_key} was expected to be of shape "
+ f"{random_flax_state_dict[flax_key].shape}, but is {flax_tensor.shape}."
+ )
+
+ # also add unexpected weight so that warning is thrown
+ flax_state_dict[flax_key] = jnp.asarray(flax_tensor)
+
+ return unflatten_dict(flax_state_dict)
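+
+
+# Minimal usage sketch (assumes `torch` is installed and `flax_model` is a FlaxModelMixin
+# subclass exposing `init_weights`; the checkpoint filename is an example):
+#
+#     pt_state_dict = torch.load("diffusion_pytorch_model.bin", map_location="cpu")
+#     flax_params = convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model)
+#
+# A shape mismatch between the checkpoint and the randomly initialized Flax parameters
+# raises a ValueError, guarding against loading weights from a different architecture.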
diff --git a/src/model/TextGen/diffusers/modeling_flax_utils.py b/src/model/TextGen/diffusers/modeling_flax_utils.py
new file mode 100644
index 0000000..5ef1002
--- /dev/null
+++ b/src/model/TextGen/diffusers/modeling_flax_utils.py
@@ -0,0 +1,526 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from pickle import UnpicklingError
+from typing import Any, Dict, Union
+
+import jax
+import jax.numpy as jnp
+import msgpack.exceptions
+from flax.core.frozen_dict import FrozenDict, unfreeze
+from flax.serialization import from_bytes, to_bytes
+from flax.traverse_util import flatten_dict, unflatten_dict
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError
+from requests import HTTPError
+
+from . import __version__, is_torch_available
+from .modeling_flax_pytorch_utils import convert_pytorch_state_dict_to_flax
+from .utils import (
+ CONFIG_NAME,
+ DIFFUSERS_CACHE,
+ FLAX_WEIGHTS_NAME,
+ HUGGINGFACE_CO_RESOLVE_ENDPOINT,
+ WEIGHTS_NAME,
+ logging,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+class FlaxModelMixin:
+ r"""
+ Base class for all flax models.
+
+ [`FlaxModelMixin`] takes care of storing the configuration of the models and handles methods for loading,
+ downloading and saving models.
+ """
+ config_name = CONFIG_NAME
+ _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"]
+ _flax_internal_args = ["name", "parent", "dtype"]
+
+ @classmethod
+ def _from_config(cls, config, **kwargs):
+ """
+ All context managers that the model should be initialized under go here.
+ """
+ return cls(config, **kwargs)
+
+ def _cast_floating_to(self, params: Union[Dict, FrozenDict], dtype: jnp.dtype, mask: Any = None) -> Any:
+ """
+ Helper method to cast floating-point values of given parameter `PyTree` to given `dtype`.
+ """
+
+ # taken from https://github.com/deepmind/jmp/blob/3a8318abc3292be38582794dbf7b094e6583b192/jmp/_src/policy.py#L27
+ def conditional_cast(param):
+ if isinstance(param, jnp.ndarray) and jnp.issubdtype(param.dtype, jnp.floating):
+ param = param.astype(dtype)
+ return param
+
+ if mask is None:
+ return jax.tree_map(conditional_cast, params)
+
+ flat_params = flatten_dict(params)
+ flat_mask, _ = jax.tree_flatten(mask)
+
+ for masked, key in zip(flat_mask, flat_params.keys()):
+ if masked:
+ param = flat_params[key]
+ flat_params[key] = conditional_cast(param)
+
+ return unflatten_dict(flat_params)
+
+ def to_bf16(self, params: Union[Dict, FrozenDict], mask: Any = None):
+ r"""
+ Cast the floating-point `params` to `jax.numpy.bfloat16`. This returns a new `params` tree and does not cast
+ the `params` in place.
+
+ This method can be used on TPU to explicitly convert the model parameters to bfloat16 precision to do full
+ half-precision training or to save weights in bfloat16 for inference in order to save memory and improve speed.
+
+ Arguments:
+ params (`Union[Dict, FrozenDict]`):
+ A `PyTree` of model parameters.
+ mask (`Union[Dict, FrozenDict]`):
+ A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for params
+ you want to cast, and should be `False` for those you want to skip.
+
+ Examples:
+
+ ```python
+ >>> from diffusers import FlaxUNet2DConditionModel
+
+ >>> # load model
+ >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
+ >>> # By default, the model parameters will be in fp32 precision, to cast these to bfloat16 precision
+ >>> params = model.to_bf16(params)
+ >>> # If you don't want to cast certain parameters (for example layer norm bias and scale)
+ >>> # then pass the mask as follows
+ >>> from flax import traverse_util
+
+ >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
+ >>> flat_params = traverse_util.flatten_dict(params)
+ >>> mask = {
+ ... path: (path[-2:] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale"))
+ ... for path in flat_params
+ ... }
+ >>> mask = traverse_util.unflatten_dict(mask)
+ >>> params = model.to_bf16(params, mask)
+ ```"""
+ return self._cast_floating_to(params, jnp.bfloat16, mask)
+
+ def to_fp32(self, params: Union[Dict, FrozenDict], mask: Any = None):
+ r"""
+ Cast the floating-point `params` to `jax.numpy.float32`. This method can be used to explicitly convert the
+ model parameters to fp32 precision. This returns a new `params` tree and does not cast the `params` in place.
+
+ Arguments:
+ params (`Union[Dict, FrozenDict]`):
+ A `PyTree` of model parameters.
+ mask (`Union[Dict, FrozenDict]`):
+ A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for params
+ you want to cast, and should be `False` for those you want to skip
+
+ Examples:
+
+ ```python
+ >>> from diffusers import FlaxUNet2DConditionModel
+
+ >>> # Download model and configuration from huggingface.co
+ >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
+ >>> # By default, the model params will be in fp32, to illustrate the use of this method,
+ >>> # we'll first cast to fp16 and back to fp32
+ >>> params = model.to_fp16(params)
+ >>> # now cast back to fp32
+ >>> params = model.to_fp32(params)
+ ```"""
+ return self._cast_floating_to(params, jnp.float32, mask)
+
+ def to_fp16(self, params: Union[Dict, FrozenDict], mask: Any = None):
+ r"""
+ Cast the floating-point `params` to `jax.numpy.float16`. This returns a new `params` tree and does not cast the
+ `params` in place.
+
+ This method can be used on GPU to explicitly convert the model parameters to float16 precision to do full
+ half-precision training or to save weights in float16 for inference in order to save memory and improve speed.
+
+ Arguments:
+ params (`Union[Dict, FrozenDict]`):
+ A `PyTree` of model parameters.
+ mask (`Union[Dict, FrozenDict]`):
+ A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for params
+ you want to cast, and should be `False` for those you want to skip
+
+ Examples:
+
+ ```python
+ >>> from diffusers import FlaxUNet2DConditionModel
+
+ >>> # load model
+ >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
+ >>> # By default, the model params will be in fp32, to cast these to float16
+ >>> params = model.to_fp16(params)
+ >>> # If you don't want to cast certain parameters (for example layer norm bias and scale)
+ >>> # then pass the mask as follows
+ >>> from flax import traverse_util
+
+ >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
+ >>> flat_params = traverse_util.flatten_dict(params)
+ >>> mask = {
+ ... path: (path[-2:] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale"))
+ ... for path in flat_params
+ ... }
+ >>> mask = traverse_util.unflatten_dict(mask)
+ >>> params = model.to_fp16(params, mask)
+ ```"""
+ return self._cast_floating_to(params, jnp.float16, mask)
+
+ def init_weights(self, rng: jax.random.PRNGKey) -> Dict:
+ raise NotImplementedError(f"init_weights method has to be implemented for {self}")
+
+ @classmethod
+ def from_pretrained(
+ cls,
+ pretrained_model_name_or_path: Union[str, os.PathLike],
+ dtype: jnp.dtype = jnp.float32,
+ *model_args,
+ **kwargs,
+ ):
+ r"""
+ Instantiate a pretrained flax model from a pre-trained model configuration.
+
+ The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
+ pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
+ task.
+
+ The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
+ weights are discarded.
+
+ Parameters:
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
+ Can be either:
+
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids are namespaced under a user or organization name, like
+ `runwayml/stable-diffusion-v1-5`.
+ - A path to a *directory* containing model weights saved using [`~ModelMixin.save_pretrained`],
+ e.g., `./my_model_directory/`.
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+ `jax.numpy.bfloat16` (on TPUs).
+
+ This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+ specified all the computation will be performed with the given `dtype`.
+
+ **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+ parameters.**
+
+ If you wish to change the dtype of the model parameters, see [`~ModelMixin.to_fp16`] and
+ [`~ModelMixin.to_bf16`].
+ model_args (sequence of positional arguments, *optional*):
+ All remaining positional arguments will be passed to the underlying model's `__init__` method.
+ cache_dir (`Union[str, os.PathLike]`, *optional*):
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the
+ standard cache should not be used.
+ force_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+ cached versions if they exist.
+ resume_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to delete incompletely received files. Will attempt to resume the download if such a
+ file exists.
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ local_files_only(`bool`, *optional*, defaults to `False`):
+ Whether or not to only look at local files (i.e., do not try to download the model).
+ revision (`str`, *optional*, defaults to `"main"`):
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+ identifier allowed by git.
+ from_pt (`bool`, *optional*, defaults to `False`):
+ Load the model weights from a PyTorch checkpoint save file.
+ kwargs (remaining dictionary of keyword arguments, *optional*):
+ Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
+ `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
+ automatically loaded:
+
+ - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
+ underlying model's `__init__` method (we assume all relevant updates to the configuration have
+ already been done)
+ - If a configuration is not provided, `kwargs` will be first passed to the configuration class
+ initialization function ([`~ConfigMixin.from_config`]). Each key of `kwargs` that corresponds to
+ a configuration attribute will be used to override said attribute with the supplied `kwargs`
+ value. Remaining keys that do not correspond to any configuration attribute will be passed to the
+ underlying model's `__init__` function.
+
+ Examples:
+
+ ```python
+ >>> from diffusers import FlaxUNet2DConditionModel
+
+ >>> # Download model and configuration from huggingface.co and cache.
+ >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
+ >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
+ >>> model, params = FlaxUNet2DConditionModel.from_pretrained("./test/saved_model/")
+ ```"""
+ config = kwargs.pop("config", None)
+ cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
+ force_download = kwargs.pop("force_download", False)
+ from_pt = kwargs.pop("from_pt", False)
+ resume_download = kwargs.pop("resume_download", False)
+ proxies = kwargs.pop("proxies", None)
+ local_files_only = kwargs.pop("local_files_only", False)
+ use_auth_token = kwargs.pop("use_auth_token", None)
+ revision = kwargs.pop("revision", None)
+ subfolder = kwargs.pop("subfolder", None)
+
+ user_agent = {
+ "diffusers": __version__,
+ "file_type": "model",
+ "framework": "flax",
+ }
+
+ # Load config if we don't provide a configuration
+ config_path = config if config is not None else pretrained_model_name_or_path
+ model, model_kwargs = cls.from_config(
+ config_path,
+ cache_dir=cache_dir,
+ return_unused_kwargs=True,
+ force_download=force_download,
+ resume_download=resume_download,
+ proxies=proxies,
+ local_files_only=local_files_only,
+ use_auth_token=use_auth_token,
+ revision=revision,
+ subfolder=subfolder,
+ # model args
+ dtype=dtype,
+ **kwargs,
+ )
+
+ # Load model
+ pretrained_path_with_subfolder = (
+ pretrained_model_name_or_path
+ if subfolder is None
+ else os.path.join(pretrained_model_name_or_path, subfolder)
+ )
+ if os.path.isdir(pretrained_path_with_subfolder):
+ if from_pt:
+ if not os.path.isfile(os.path.join(pretrained_path_with_subfolder, WEIGHTS_NAME)):
+ raise EnvironmentError(
+ f"Error no file named {WEIGHTS_NAME} found in directory {pretrained_path_with_subfolder} "
+ )
+ model_file = os.path.join(pretrained_path_with_subfolder, WEIGHTS_NAME)
+ elif os.path.isfile(os.path.join(pretrained_path_with_subfolder, FLAX_WEIGHTS_NAME)):
+ # Load from a Flax checkpoint
+ model_file = os.path.join(pretrained_path_with_subfolder, FLAX_WEIGHTS_NAME)
+ # Check if pytorch weights exist instead
+ elif os.path.isfile(os.path.join(pretrained_path_with_subfolder, WEIGHTS_NAME)):
+ raise EnvironmentError(
+ f"{WEIGHTS_NAME} file found in directory {pretrained_path_with_subfolder}. Please load the model"
+ " using `from_pt=True`."
+ )
+ else:
+ raise EnvironmentError(
+ f"Error no file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME} found in directory "
+ f"{pretrained_path_with_subfolder}."
+ )
+ else:
+ try:
+ model_file = hf_hub_download(
+ pretrained_model_name_or_path,
+ filename=FLAX_WEIGHTS_NAME if not from_pt else WEIGHTS_NAME,
+ cache_dir=cache_dir,
+ force_download=force_download,
+ proxies=proxies,
+ resume_download=resume_download,
+ local_files_only=local_files_only,
+ use_auth_token=use_auth_token,
+ user_agent=user_agent,
+ subfolder=subfolder,
+ revision=revision,
+ )
+
+ except RepositoryNotFoundError:
+ raise EnvironmentError(
+ f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier "
+ "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a "
+ "token having permission to this repo with `use_auth_token` or log in with `huggingface-cli "
+ "login`."
+ )
+ except RevisionNotFoundError:
+ raise EnvironmentError(
+ f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for "
+ "this model name. Check the model page at "
+ f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
+ )
+ except EntryNotFoundError:
+ raise EnvironmentError(
+ f"{pretrained_model_name_or_path} does not appear to have a file named {FLAX_WEIGHTS_NAME}."
+ )
+ except HTTPError as err:
+ raise EnvironmentError(
+ f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n"
+ f"{err}"
+ )
+ except ValueError:
+ raise EnvironmentError(
+ f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
+ f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
+ f" directory containing a file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME}.\nCheckout your"
+ " internet connection or see how to run the library in offline mode at"
+ " 'https://huggingface.co/docs/transformers/installation#offline-mode'."
+ )
+ except EnvironmentError:
+ raise EnvironmentError(
+ f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from "
+ "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
+ f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
+ f"containing a file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME}."
+ )
+
+ if from_pt:
+ if is_torch_available():
+ from .modeling_utils import load_state_dict
+ else:
+ raise EnvironmentError(
+ "Can't load the model in PyTorch format because PyTorch is not installed. "
+ "Please, install PyTorch or use native Flax weights."
+ )
+
+ # Step 1: Get the pytorch file
+ pytorch_model_file = load_state_dict(model_file)
+
+ # Step 2: Convert the weights
+ state = convert_pytorch_state_dict_to_flax(pytorch_model_file, model)
+ else:
+ try:
+ with open(model_file, "rb") as state_f:
+ state = from_bytes(cls, state_f.read())
+ except (UnpicklingError, msgpack.exceptions.ExtraData) as e:
+ try:
+ with open(model_file) as f:
+ if f.read().startswith("version"):
+ raise OSError(
+ "You seem to have cloned a repository without having git-lfs installed. Please"
+ " install git-lfs and run `git lfs install` followed by `git lfs pull` in the"
+ " folder you cloned."
+ )
+ else:
+ raise ValueError from e
+ except (UnicodeDecodeError, ValueError):
+ raise EnvironmentError(f"Unable to convert {model_file} to Flax deserializable object. ")
+ # make sure all arrays are stored as jnp.ndarray
+ # NOTE: this works around a bug that will be fixed in Flax >= v0.3.4:
+ # https://github.com/google/flax/issues/1261
+ state = jax.tree_util.tree_map(lambda x: jax.device_put(x, jax.devices("cpu")[0]), state)
+
+ # flatten dicts
+ state = flatten_dict(state)
+
+ params_shape_tree = jax.eval_shape(model.init_weights, rng=jax.random.PRNGKey(0))
+ required_params = set(flatten_dict(unfreeze(params_shape_tree)).keys())
+
+ shape_state = flatten_dict(unfreeze(params_shape_tree))
+
+ missing_keys = required_params - set(state.keys())
+ unexpected_keys = set(state.keys()) - required_params
+
+ if missing_keys:
+ logger.warning(
+ f"The checkpoint {pretrained_model_name_or_path} is missing required keys: {missing_keys}. "
+ "Make sure to call model.init_weights to initialize the missing weights."
+ )
+ cls._missing_keys = missing_keys
+
+ for key in state.keys():
+ if key in shape_state and state[key].shape != shape_state[key].shape:
+ raise ValueError(
+ f"Trying to load the pretrained weight for {key} failed: checkpoint has shape "
+ f"{state[key].shape} which is incompatible with the model shape {shape_state[key].shape}. "
+ )
+
+ # remove unexpected keys to not be saved again
+ for unexpected_key in unexpected_keys:
+ del state[unexpected_key]
+
+ if len(unexpected_keys) > 0:
+ logger.warning(
+ f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
+ f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are"
+ f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task or"
+ " with another architecture."
+ )
+ else:
+ logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
+
+ if len(missing_keys) > 0:
+ logger.warning(
+ f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
+ f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably"
+ " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
+ )
+ else:
+ logger.info(
+ f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
+ f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the checkpoint"
+ f" was trained on, you can already use {model.__class__.__name__} for predictions without further"
+ " training."
+ )
+
+ return model, unflatten_dict(state)
+
+ def save_pretrained(
+ self,
+ save_directory: Union[str, os.PathLike],
+ params: Union[Dict, FrozenDict],
+ is_main_process: bool = True,
+ ):
+ """
+ Save a model and its configuration file to a directory, so that it can be re-loaded using the
+ [`~FlaxModelMixin.from_pretrained`] class method.
+
+ Arguments:
+ save_directory (`str` or `os.PathLike`):
+ Directory to which to save. Will be created if it doesn't exist.
+ params (`Union[Dict, FrozenDict]`):
+ A `PyTree` of model parameters.
+ is_main_process (`bool`, *optional*, defaults to `True`):
+ Whether the process calling this is the main process or not. Useful when in distributed training like
+ TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
+ the main process to avoid race conditions.
+ """
+ if os.path.isfile(save_directory):
+ logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
+ return
+
+ os.makedirs(save_directory, exist_ok=True)
+
+ model_to_save = self
+
+ # Attach architecture to the config
+ # Save the config
+ if is_main_process:
+ model_to_save.save_config(save_directory)
+
+ # save model
+ output_model_file = os.path.join(save_directory, FLAX_WEIGHTS_NAME)
+ with open(output_model_file, "wb") as f:
+ model_bytes = to_bytes(params)
+ f.write(model_bytes)
+
+ logger.info(f"Model weights saved in {output_model_file}")
diff --git a/src/model/TextGen/diffusers/modeling_utils.py b/src/model/TextGen/diffusers/modeling_utils.py
new file mode 100644
index 0000000..1e91ccd
--- /dev/null
+++ b/src/model/TextGen/diffusers/modeling_utils.py
@@ -0,0 +1,691 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from functools import partial
+from typing import Callable, List, Optional, Tuple, Union
+
+import torch
+from torch import Tensor, device
+
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError
+from requests import HTTPError
+
+from . import __version__
+from .utils import (
+ CONFIG_NAME,
+ DIFFUSERS_CACHE,
+ HUGGINGFACE_CO_RESOLVE_ENDPOINT,
+ WEIGHTS_NAME,
+ is_accelerate_available,
+ is_torch_version,
+ logging,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_torch_version(">=", "1.9.0"):
+ _LOW_CPU_MEM_USAGE_DEFAULT = True
+else:
+ _LOW_CPU_MEM_USAGE_DEFAULT = False
+
+
+if is_accelerate_available():
+ import accelerate
+ from accelerate.utils import set_module_tensor_to_device
+ from accelerate.utils.versions import is_torch_version
+
+
+def get_parameter_device(parameter: torch.nn.Module):
+ try:
+ return next(parameter.parameters()).device
+ except StopIteration:
+ # For torch.nn.DataParallel compatibility in PyTorch 1.5
+
+ def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
+ tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
+ return tuples
+
+ gen = parameter._named_members(get_members_fn=find_tensor_attributes)
+ first_tuple = next(gen)
+ return first_tuple[1].device
+
+
+def get_parameter_dtype(parameter: torch.nn.Module):
+ try:
+ return next(parameter.parameters()).dtype
+ except StopIteration:
+ # For torch.nn.DataParallel compatibility in PyTorch 1.5
+
+ def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
+ tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
+ return tuples
+
+ gen = parameter._named_members(get_members_fn=find_tensor_attributes)
+ first_tuple = next(gen)
+ return first_tuple[1].dtype
+
+
+def load_state_dict(checkpoint_file: Union[str, os.PathLike]):
+ """
+ Reads a PyTorch checkpoint file, returning properly formatted errors if they arise.
+ """
+ try:
+ return torch.load(checkpoint_file, map_location="cpu")
+ except Exception as e:
+ try:
+ with open(checkpoint_file) as f:
+ if f.read().startswith("version"):
+ raise OSError(
+ "You seem to have cloned a repository without having git-lfs installed. Please install "
+ "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder "
+ "you cloned."
+ )
+ else:
+ raise ValueError(
+ f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained "
+ "model. Make sure you have saved the model properly."
+ ) from e
+ except (UnicodeDecodeError, ValueError):
+ raise OSError(
+ f"Unable to load weights from pytorch checkpoint file for '{checkpoint_file}' "
+ f"at '{checkpoint_file}'. "
+ "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True."
+ )
+
+
+def _load_state_dict_into_model(model_to_load, state_dict):
+ # Convert old format to new format if needed from a PyTorch state_dict
+ # copy state_dict so _load_from_state_dict can modify it
+ state_dict = state_dict.copy()
+ error_msgs = []
+
+ # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
+ # so we need to apply the function recursively.
+ def load(module: torch.nn.Module, prefix=""):
+ args = (state_dict, prefix, {}, True, [], [], error_msgs)
+ module._load_from_state_dict(*args)
+
+ for name, child in module._modules.items():
+ if child is not None:
+ load(child, prefix + name + ".")
+
+ load(model_to_load)
+
+ return error_msgs
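+
+
+# Illustrative note: the recursion mirrors `torch.nn.Module.load_state_dict`, but collects
+# error messages instead of raising. For a module tree like `model.down_blocks[0]`, `load`
+# is invoked with prefixes "", "down_blocks.", "down_blocks.0.", ..., so each submodule
+# consumes its own slice of the flat state dict.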
+
+
+class ModelMixin(torch.nn.Module):
+ r"""
+ Base class for all models.
+
+ [`ModelMixin`] takes care of storing the configuration of the models and handles methods for loading, downloading
+ and saving models.
+
+ - **config_name** ([`str`]) -- A filename under which the model should be stored when calling
+ [`~modeling_utils.ModelMixin.save_pretrained`].
+ """
+ config_name = CONFIG_NAME
+ _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"]
+ _supports_gradient_checkpointing = False
+
+ def __init__(self):
+ super().__init__()
+
+ @property
+ def is_gradient_checkpointing(self) -> bool:
+ """
+ Whether gradient checkpointing is activated for this model or not.
+
+ Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
+ activations".
+ """
+ return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules())
+
+ def enable_gradient_checkpointing(self):
+ """
+ Activates gradient checkpointing for the current model.
+
+ Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
+ activations".
+ """
+ if not self._supports_gradient_checkpointing:
+ raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
+ self.apply(partial(self._set_gradient_checkpointing, value=True))
+
+ def disable_gradient_checkpointing(self):
+ """
+ Deactivates gradient checkpointing for the current model.
+
+ Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
+ activations".
+ """
+ if self._supports_gradient_checkpointing:
+ self.apply(partial(self._set_gradient_checkpointing, value=False))
+
+ def save_pretrained(
+ self,
+ save_directory: Union[str, os.PathLike],
+ is_main_process: bool = True,
+ save_function: Callable = torch.save,
+ ):
+ """
+ Save a model and its configuration file to a directory, so that it can be re-loaded using the
+ [`~modeling_utils.ModelMixin.from_pretrained`] class method.
+
+ Arguments:
+ save_directory (`str` or `os.PathLike`):
+ Directory to which to save. Will be created if it doesn't exist.
+ is_main_process (`bool`, *optional*, defaults to `True`):
+ Whether the process calling this is the main process or not. Useful when in distributed training like
+ TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
+ the main process to avoid race conditions.
+ save_function (`Callable`):
+ The function to use to save the state dictionary. Useful on distributed training like TPUs when one
+ needs to replace `torch.save` with another method.
+ """
+ if os.path.isfile(save_directory):
+ logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
+ return
+
+ os.makedirs(save_directory, exist_ok=True)
+
+ model_to_save = self
+
+ # Attach architecture to the config
+ # Save the config
+ if is_main_process:
+ model_to_save.save_config(save_directory)
+
+ # Save the model
+ state_dict = model_to_save.state_dict()
+
+ # Clean the folder from a previous save
+ for filename in os.listdir(save_directory):
+ full_filename = os.path.join(save_directory, filename)
+ # If we have a shard file that is not going to be replaced, we delete it, but only from the main process
+ # in distributed settings to avoid race conditions.
+ if filename.startswith(WEIGHTS_NAME[:-4]) and os.path.isfile(full_filename) and is_main_process:
+ os.remove(full_filename)
+
+ # Save the model
+ save_function(state_dict, os.path.join(save_directory, WEIGHTS_NAME))
+
+ logger.info(f"Model weights saved in {os.path.join(save_directory, WEIGHTS_NAME)}")
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
+ r"""
+ Instantiate a pretrained pytorch model from a pre-trained model configuration.
+
+ The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
+ the model, you should first set it back in training mode with `model.train()`.
+
+ The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
+ pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
+ task.
+
+ The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
+ weights are discarded.
+
+ Parameters:
+ pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+ Can be either:
+
+ - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids should have an organization name, like `google/ddpm-celebahq-256`.
+ - A path to a *directory* containing model weights saved using [`~ModelMixin.save_config`], e.g.,
+ `./my_model_directory/`.
+
+ cache_dir (`Union[str, os.PathLike]`, *optional*):
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the
+ standard cache should not be used.
+ torch_dtype (`str` or `torch.dtype`, *optional*):
+ Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype
+ will be automatically derived from the model's weights.
+ force_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+ cached versions if they exist.
+ resume_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to delete incompletely received files. Will attempt to resume the download if such a
+ file exists.
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ output_loading_info(`bool`, *optional*, defaults to `False`):
+ Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+ local_files_only(`bool`, *optional*, defaults to `False`):
+ Whether or not to only look at local files (i.e., do not try to download the model).
+ use_auth_token (`str` or *bool*, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+ when running `diffusers-cli login` (stored in `~/.huggingface`).
+ revision (`str`, *optional*, defaults to `"main"`):
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+ identifier allowed by git.
+ subfolder (`str`, *optional*, defaults to `""`):
+ In case the relevant files are located inside a subfolder of the model repo (either remote in
+ huggingface.co or downloaded locally), you can specify the folder name here.
+
+ mirror (`str`, *optional*):
+ Mirror source to accelerate downloads in China. If you are from China and have an accessibility
+ problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
+ Please refer to the mirror site for more information.
+ device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+ A map that specifies where each submodule should go. It doesn't need to be refined to each
+ parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
+ same device.
+
+ To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
+ more information about each option see [designing a device
+ map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+ low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+ Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
+ also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
+ model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch,
+ setting this argument to `True` will raise an error.
+
+ It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
+ models](https://huggingface.co/docs/hub/models-gated#gated-models).
+
+ Activate the special ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use
+ this method in a firewalled environment.
+
+ """
+ cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
+ ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
+ force_download = kwargs.pop("force_download", False)
+ resume_download = kwargs.pop("resume_download", False)
+ proxies = kwargs.pop("proxies", None)
+ output_loading_info = kwargs.pop("output_loading_info", False)
+ local_files_only = kwargs.pop("local_files_only", False)
+ use_auth_token = kwargs.pop("use_auth_token", None)
+ revision = kwargs.pop("revision", None)
+ torch_dtype = kwargs.pop("torch_dtype", None)
+ subfolder = kwargs.pop("subfolder", None)
+ device_map = kwargs.pop("device_map", None)
+ low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
+
+ if low_cpu_mem_usage and not is_accelerate_available():
+ low_cpu_mem_usage = False
+ logger.warning(
+ "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
+ " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
+ " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
+ " install accelerate\n```\n."
+ )
+
+ if device_map is not None and not is_accelerate_available():
+ raise NotImplementedError(
+ "Loading and dispatching requires `accelerate`. Please make sure to install accelerate or set"
+ " `device_map=None`. You can install accelerate with `pip install accelerate`."
+ )
+
+ # Check if we can handle device_map and dispatching the weights
+ if device_map is not None and not is_torch_version(">=", "1.9.0"):
+ raise NotImplementedError(
+ "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
+ " `device_map=None`."
+ )
+
+ if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
+ raise NotImplementedError(
+ "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
+ " `low_cpu_mem_usage=False`."
+ )
+
+ if low_cpu_mem_usage is False and device_map is not None:
+ raise ValueError(
+ f"You cannot set `low_cpu_mem_usage` to `False` while using device_map={device_map} for loading and"
+ " dispatching. Please make sure to set `low_cpu_mem_usage=True`."
+ )
+
+ user_agent = {
+ "diffusers": __version__,
+ "file_type": "model",
+ "framework": "pytorch",
+ }
+
+ # Load config if we don't provide a configuration
+ config_path = pretrained_model_name_or_path
+
+ # Load model
+ pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+ if os.path.isdir(pretrained_model_name_or_path):
+ if os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
+ # Load from a PyTorch checkpoint
+ model_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
+ elif subfolder is not None and os.path.isfile(
+ os.path.join(pretrained_model_name_or_path, subfolder, WEIGHTS_NAME)
+ ):
+ model_file = os.path.join(pretrained_model_name_or_path, subfolder, WEIGHTS_NAME)
+ else:
+ raise EnvironmentError(
+ f"Error no file named {WEIGHTS_NAME} found in directory {pretrained_model_name_or_path}."
+ )
+ else:
+ try:
+ # Load from URL or cache if already cached
+ model_file = hf_hub_download(
+ pretrained_model_name_or_path,
+ filename=WEIGHTS_NAME,
+ cache_dir=cache_dir,
+ force_download=force_download,
+ proxies=proxies,
+ resume_download=resume_download,
+ local_files_only=local_files_only,
+ use_auth_token=use_auth_token,
+ user_agent=user_agent,
+ subfolder=subfolder,
+ revision=revision,
+ )
+
+ except RepositoryNotFoundError:
+ raise EnvironmentError(
+ f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier "
+ "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a "
+ "token having permission to this repo with `use_auth_token` or log in with `huggingface-cli "
+ "login`."
+ )
+ except RevisionNotFoundError:
+ raise EnvironmentError(
+ f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for "
+ "this model name. Check the model page at "
+ f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
+ )
+ except EntryNotFoundError:
+ raise EnvironmentError(
+ f"{pretrained_model_name_or_path} does not appear to have a file named {WEIGHTS_NAME}."
+ )
+ except HTTPError as err:
+ raise EnvironmentError(
+ "There was a specific connection error when trying to load"
+ f" {pretrained_model_name_or_path}:\n{err}"
+ )
+ except ValueError:
+ raise EnvironmentError(
+ f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
+ f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
+ f" directory containing a file named {WEIGHTS_NAME} or"
+ " \nCheckout your internet connection or see how to run the library in"
+ " offline mode at 'https://huggingface.co/docs/diffusers/installation#offline-mode'."
+ )
+ except EnvironmentError:
+ raise EnvironmentError(
+ f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from "
+ "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
+ f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
+ f"containing a file named {WEIGHTS_NAME}"
+ )
+
+ if low_cpu_mem_usage:
+ # Instantiate model with empty weights
+ with accelerate.init_empty_weights():
+ model, unused_kwargs = cls.from_config(
+ config_path,
+ cache_dir=cache_dir,
+ return_unused_kwargs=True,
+ force_download=force_download,
+ resume_download=resume_download,
+ proxies=proxies,
+ local_files_only=local_files_only,
+ use_auth_token=use_auth_token,
+ revision=revision,
+ subfolder=subfolder,
+ device_map=device_map,
+ **kwargs,
+ )
+
+            # if device_map is None, load the state dict and move the params from the meta device to the cpu
+ if device_map is None:
+ param_device = "cpu"
+ state_dict = load_state_dict(model_file)
+                # move the params from the meta device to the cpu
+ for param_name, param in state_dict.items():
+ set_module_tensor_to_device(model, param_name, param_device, value=param)
+ else: # else let accelerate handle loading and dispatching.
+ # Load weights and dispatch according to the device_map
+                # by default the device_map is None and the weights are loaded on the CPU
+ accelerate.load_checkpoint_and_dispatch(model, model_file, device_map)
+
+ loading_info = {
+ "missing_keys": [],
+ "unexpected_keys": [],
+ "mismatched_keys": [],
+ "error_msgs": [],
+ }
+ else:
+ model, unused_kwargs = cls.from_config(
+ config_path,
+ cache_dir=cache_dir,
+ return_unused_kwargs=True,
+ force_download=force_download,
+ resume_download=resume_download,
+ proxies=proxies,
+ local_files_only=local_files_only,
+ use_auth_token=use_auth_token,
+ revision=revision,
+ subfolder=subfolder,
+ device_map=device_map,
+ **kwargs,
+ )
+
+ state_dict = load_state_dict(model_file)
+ model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model(
+ model,
+ state_dict,
+ model_file,
+ pretrained_model_name_or_path,
+ ignore_mismatched_sizes=ignore_mismatched_sizes,
+ )
+
+ loading_info = {
+ "missing_keys": missing_keys,
+ "unexpected_keys": unexpected_keys,
+ "mismatched_keys": mismatched_keys,
+ "error_msgs": error_msgs,
+ }
+
+ if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype):
+ raise ValueError(
+ f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}."
+ )
+ elif torch_dtype is not None:
+ model = model.to(torch_dtype)
+
+ model.register_to_config(_name_or_path=pretrained_model_name_or_path)
+
+ # Set model in evaluation mode to deactivate DropOut modules by default
+ model.eval()
+ if output_loading_info:
+ return model, loading_info
+
+ return model
+
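+    # Illustrative usage sketch (comment only; the class name, path and
+    # subfolder below are assumptions, not part of this file):
+    #
+    #     model = UNet2DConditionModel.from_pretrained(
+    #         "weights/sd", subfolder="unet", low_cpu_mem_usage=True
+    #     )
+    #
+    # `low_cpu_mem_usage=True` requires `accelerate` and torch >= 1.9.0, as
+    # enforced by the checks at the top of this method.
+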
+ @classmethod
+ def _load_pretrained_model(
+ cls,
+ model,
+ state_dict,
+ resolved_archive_file,
+ pretrained_model_name_or_path,
+ ignore_mismatched_sizes=False,
+ ):
+ # Retrieve missing & unexpected_keys
+ model_state_dict = model.state_dict()
+        loaded_keys = list(state_dict.keys())
+
+ expected_keys = list(model_state_dict.keys())
+
+ original_loaded_keys = loaded_keys
+
+ missing_keys = list(set(expected_keys) - set(loaded_keys))
+ unexpected_keys = list(set(loaded_keys) - set(expected_keys))
+
+ # Make sure we are able to load base models as well as derived models (with heads)
+ model_to_load = model
+
+ def _find_mismatched_keys(
+ state_dict,
+ model_state_dict,
+ loaded_keys,
+ ignore_mismatched_sizes,
+ ):
+ mismatched_keys = []
+ if ignore_mismatched_sizes:
+ for checkpoint_key in loaded_keys:
+ model_key = checkpoint_key
+
+ if (
+ model_key in model_state_dict
+ and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape
+ ):
+ mismatched_keys.append(
+ (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape)
+ )
+ del state_dict[checkpoint_key]
+ return mismatched_keys
+
+ if state_dict is not None:
+ # Whole checkpoint
+ mismatched_keys = _find_mismatched_keys(
+ state_dict,
+ model_state_dict,
+ original_loaded_keys,
+ ignore_mismatched_sizes,
+ )
+ error_msgs = _load_state_dict_into_model(model_to_load, state_dict)
+
+ if len(error_msgs) > 0:
+ error_msg = "\n\t".join(error_msgs)
+ if "size mismatch" in error_msg:
+ error_msg += (
+ "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method."
+ )
+ raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")
+
+ if len(unexpected_keys) > 0:
+ logger.warning(
+ f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
+ f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are"
+ f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task"
+ " or with another architecture (e.g. initializing a BertForSequenceClassification model from a"
+ " BertForPreTraining model).\n- This IS NOT expected if you are initializing"
+ f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly"
+ " identical (initializing a BertForSequenceClassification model from a"
+ " BertForSequenceClassification model)."
+ )
+ else:
+ logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
+ if len(missing_keys) > 0:
+ logger.warning(
+ f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
+ f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably"
+ " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
+ )
+ elif len(mismatched_keys) == 0:
+ logger.info(
+ f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
+ f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the"
+ f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions"
+ " without further training."
+ )
+ if len(mismatched_keys) > 0:
+ mismatched_warning = "\n".join(
+ [
+ f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
+ for key, shape1, shape2 in mismatched_keys
+ ]
+ )
+ logger.warning(
+ f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
+ f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not"
+ f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be"
+ " able to use it for predictions and inference."
+ )
+
+ return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs
+
+ @property
+ def device(self) -> device:
+ """
+ `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
+ device).
+ """
+ return get_parameter_device(self)
+
+ @property
+ def dtype(self) -> torch.dtype:
+ """
+ `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
+ """
+ return get_parameter_dtype(self)
+
+ def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int:
+ """
+ Get number of (optionally, trainable or non-embeddings) parameters in the module.
+
+ Args:
+ only_trainable (`bool`, *optional*, defaults to `False`):
+ Whether or not to return only the number of trainable parameters
+
+ exclude_embeddings (`bool`, *optional*, defaults to `False`):
+ Whether or not to return only the number of non-embeddings parameters
+
+ Returns:
+ `int`: The number of parameters.
+ """
+
+ if exclude_embeddings:
+ embedding_param_names = [
+ f"{name}.weight"
+ for name, module_type in self.named_modules()
+ if isinstance(module_type, torch.nn.Embedding)
+ ]
+ non_embedding_parameters = [
+ parameter for name, parameter in self.named_parameters() if name not in embedding_param_names
+ ]
+ return sum(p.numel() for p in non_embedding_parameters if p.requires_grad or not only_trainable)
+ else:
+ return sum(p.numel() for p in self.parameters() if p.requires_grad or not only_trainable)
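+
+    # Illustrative sketch (comment only, an assumption rather than part of the
+    # original file): for a module holding one nn.Embedding(10, 4) and one
+    # nn.Linear(4, 2), `num_parameters()` counts 10 * 4 + 4 * 2 + 2 = 50
+    # parameters, while `num_parameters(exclude_embeddings=True)` counts only
+    # the linear layer's 4 * 2 + 2 = 10.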
+
+
+def unwrap_model(model: torch.nn.Module) -> torch.nn.Module:
+ """
+ Recursively unwraps a model from potential containers (as used in distributed training).
+
+ Args:
+ model (`torch.nn.Module`): The model to unwrap.
+ """
+ # since there could be multiple levels of wrapping, unwrap recursively
+ if hasattr(model, "module"):
+ return unwrap_model(model.module)
+ else:
+ return model
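+
+
+# Minimal sketch (an assumption, not part of the original file): any wrapper
+# that exposes the inner model as `.module`, as DistributedDataParallel does,
+# is peeled off recursively.
+def _demo_unwrap_model():
+    inner = torch.nn.Linear(2, 2)
+    wrapper = torch.nn.Module()
+    wrapper.module = inner  # mimic a distributed-training container
+    assert unwrap_model(wrapper) is inner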
diff --git a/src/model/TextGen/diffusers/models/__init__.py b/src/model/TextGen/diffusers/models/__init__.py
new file mode 100644
index 0000000..b4cf584
--- /dev/null
+++ b/src/model/TextGen/diffusers/models/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..utils import is_flax_available, is_torch_available
+
+
+if is_torch_available():
+ from .attention import Transformer2DModel
+ from .unet_1d import UNet1DModel
+ from .unet_2d import UNet2DModel
+ from .unet_2d_condition import UNet2DConditionModel
+ from .vae import AutoencoderKL, VQModel
+
+
+if is_flax_available():
+ from .unet_2d_condition_flax import FlaxUNet2DConditionModel
+ from .vae_flax import FlaxAutoencoderKL
diff --git a/src/model/TextGen/diffusers/models/attention.py b/src/model/TextGen/diffusers/models/attention.py
new file mode 100644
index 0000000..1674ff7
--- /dev/null
+++ b/src/model/TextGen/diffusers/models/attention.py
@@ -0,0 +1,665 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..modeling_utils import ModelMixin
+from ..models.embeddings import ImagePositionalEmbeddings
+from ..utils import BaseOutput
+from ..utils.import_utils import is_xformers_available
+
+
+@dataclass
+class Transformer2DModelOutput(BaseOutput):
+ """
+ Args:
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
+ Hidden states conditioned on `encoder_hidden_states` input. If discrete, returns probability distributions
+ for the unnoised latent pixels.
+ """
+
+ sample: torch.FloatTensor
+
+
+if is_xformers_available():
+ import xformers
+ import xformers.ops
+else:
+ xformers = None
+
+
+class Transformer2DModel(ModelMixin, ConfigMixin):
+ """
+ Transformer model for image-like data. Takes either discrete (classes of vector embeddings) or continuous (actual
+ embeddings) inputs.
+
+ When input is continuous: First, project the input (aka embedding) and reshape to b, t, d. Then apply standard
+ transformer action. Finally, reshape to image.
+
+ When input is discrete: First, input (classes of latent pixels) is converted to embeddings and has positional
+ embeddings applied, see `ImagePositionalEmbeddings`. Then apply standard transformer action. Finally, predict
+ classes of unnoised image.
+
+ Note that it is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised
+ image do not contain a prediction for the masked pixel as the unnoised image cannot be masked.
+
+ Parameters:
+ num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+ attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+ in_channels (`int`, *optional*):
+ Pass if the input is continuous. The number of channels in the input and output.
+ num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+ dropout (`float`, *optional*, defaults to 0.1): The dropout probability to use.
+ cross_attention_dim (`int`, *optional*): The number of context dimensions to use.
+ sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
+ Note that this is fixed at training time as it is used for learning a number of position embeddings. See
+ `ImagePositionalEmbeddings`.
+ num_vector_embeds (`int`, *optional*):
+ Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
+ Includes the class for the masked latent pixel.
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+ num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
+ The number of diffusion steps used during training. Note that this is fixed at training time as it is used
+ to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
+            up to but not more steps than `num_embeds_ada_norm`.
+ attention_bias (`bool`, *optional*):
+ Configure if the TransformerBlocks' attention should contain a bias parameter.
+ """
+
+ @register_to_config
+ def __init__(
+ self,
+ num_attention_heads: int = 16,
+ attention_head_dim: int = 88,
+ in_channels: Optional[int] = None,
+ num_layers: int = 1,
+ dropout: float = 0.0,
+ norm_num_groups: int = 32,
+ cross_attention_dim: Optional[int] = None,
+ attention_bias: bool = False,
+ sample_size: Optional[int] = None,
+ num_vector_embeds: Optional[int] = None,
+ activation_fn: str = "geglu",
+ num_embeds_ada_norm: Optional[int] = None,
+ ):
+ super().__init__()
+ self.num_attention_heads = num_attention_heads
+ self.attention_head_dim = attention_head_dim
+ inner_dim = num_attention_heads * attention_head_dim
+
+        # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
+ # Define whether input is continuous or discrete depending on configuration
+ self.is_input_continuous = in_channels is not None
+ self.is_input_vectorized = num_vector_embeds is not None
+
+ if self.is_input_continuous and self.is_input_vectorized:
+ raise ValueError(
+ f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
+ " sure that either `in_channels` or `num_vector_embeds` is None."
+ )
+ elif not self.is_input_continuous and not self.is_input_vectorized:
+ raise ValueError(
+ f"Has to define either `in_channels`: {in_channels} or `num_vector_embeds`: {num_vector_embeds}. Make"
+ " sure that either `in_channels` or `num_vector_embeds` is not None."
+ )
+
+ # 2. Define input layers
+ if self.is_input_continuous:
+ self.in_channels = in_channels
+
+ self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+ self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+ elif self.is_input_vectorized:
+ assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
+ assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
+
+ self.height = sample_size
+ self.width = sample_size
+ self.num_vector_embeds = num_vector_embeds
+ self.num_latent_pixels = self.height * self.width
+
+ self.latent_image_embedding = ImagePositionalEmbeddings(
+ num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
+ )
+
+ # 3. Define transformers blocks
+ self.transformer_blocks = nn.ModuleList(
+ [
+ BasicTransformerBlock(
+ inner_dim,
+ num_attention_heads,
+ attention_head_dim,
+ dropout=dropout,
+ cross_attention_dim=cross_attention_dim,
+ activation_fn=activation_fn,
+ num_embeds_ada_norm=num_embeds_ada_norm,
+ attention_bias=attention_bias,
+ )
+ for d in range(num_layers)
+ ]
+ )
+
+ # 4. Define output layers
+ if self.is_input_continuous:
+ self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
+ elif self.is_input_vectorized:
+ self.norm_out = nn.LayerNorm(inner_dim)
+ self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
+
+ def _set_attention_slice(self, slice_size):
+ for block in self.transformer_blocks:
+ block._set_attention_slice(slice_size)
+
+ def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, return_dict: bool = True):
+ """
+ Args:
+            hidden_states (When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
+                When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
+                hidden_states.
+ encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, context dim)`, *optional*):
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+ self-attention.
+ timestep ( `torch.long`, *optional*):
+ Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+
+ Returns:
+ [`~models.attention.Transformer2DModelOutput`] or `tuple`: [`~models.attention.Transformer2DModelOutput`]
+ if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample
+ tensor.
+ """
+ # 1. Input
+ if self.is_input_continuous:
+            batch, channel, height, width = hidden_states.shape
+            residual = hidden_states
+            hidden_states = self.norm(hidden_states)
+            hidden_states = self.proj_in(hidden_states)
+            inner_dim = hidden_states.shape[1]
+            hidden_states = hidden_states.permute(0, 2, 3, 1).contiguous().reshape(batch, height * width, inner_dim)
+        elif self.is_input_vectorized:
+            hidden_states = self.latent_image_embedding(hidden_states)
+
+        # 2. Blocks
+        for block in self.transformer_blocks:
+            hidden_states = block(hidden_states, context=encoder_hidden_states, timestep=timestep)
+
+        # 3. Output
+        if self.is_input_continuous:
+            hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
+ hidden_states = self.proj_out(hidden_states)
+ output = hidden_states + residual
+ elif self.is_input_vectorized:
+ hidden_states = self.norm_out(hidden_states)
+ logits = self.out(hidden_states)
+ # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
+ logits = logits.permute(0, 2, 1)
+
+ # log(p(x_0))
+ output = F.log_softmax(logits.double(), dim=1).float()
+
+ if not return_dict:
+ return (output,)
+
+ return Transformer2DModelOutput(sample=output)
+
+ def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+ for block in self.transformer_blocks:
+ block._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
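+
+
+# Minimal sketch (an assumption, not part of the original file): the
+# continuous-input path of Transformer2DModel preserves the input shape thanks
+# to the residual connection. All sizes are demo values.
+def _demo_transformer_2d_continuous():
+    model = Transformer2DModel(
+        num_attention_heads=2, attention_head_dim=8, in_channels=16, norm_num_groups=4
+    )
+    sample = torch.randn(1, 16, 8, 8)  # (batch, channels, height, width)
+    assert model(sample).sample.shape == sample.shape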
+
+
+class AttentionBlock(nn.Module):
+ """
+ An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
+ to the N-d case.
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
+ Uses three q, k, v linear layers to compute attention.
+
+ Parameters:
+ channels (`int`): The number of channels in the input and output.
+ num_head_channels (`int`, *optional*):
+ The number of channels in each head. If None, then `num_heads` = 1.
+ norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for group norm.
+ rescale_output_factor (`float`, *optional*, defaults to 1.0): The factor to rescale the output by.
+ eps (`float`, *optional*, defaults to 1e-5): The epsilon value to use for group norm.
+ """
+
+ def __init__(
+ self,
+ channels: int,
+ num_head_channels: Optional[int] = None,
+ norm_num_groups: int = 32,
+ rescale_output_factor: float = 1.0,
+ eps: float = 1e-5,
+ ):
+ super().__init__()
+ self.channels = channels
+
+ self.num_heads = channels // num_head_channels if num_head_channels is not None else 1
+ self.num_head_size = num_head_channels
+ self.group_norm = nn.GroupNorm(num_channels=channels, num_groups=norm_num_groups, eps=eps, affine=True)
+
+ # define q,k,v as linear layers
+ self.query = nn.Linear(channels, channels)
+ self.key = nn.Linear(channels, channels)
+ self.value = nn.Linear(channels, channels)
+
+ self.rescale_output_factor = rescale_output_factor
+        self.proj_attn = nn.Linear(channels, channels, bias=True)
+
+ def transpose_for_scores(self, projection: torch.Tensor) -> torch.Tensor:
+ new_projection_shape = projection.size()[:-1] + (self.num_heads, -1)
+ # move heads to 2nd position (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
+ new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
+ return new_projection
+
+ def forward(self, hidden_states):
+ residual = hidden_states
+ batch, channel, height, width = hidden_states.shape
+
+ # norm
+ hidden_states = self.group_norm(hidden_states)
+
+ hidden_states = hidden_states.view(batch, channel, height * width).transpose(1, 2)
+
+ # proj to q, k, v
+ query_proj = self.query(hidden_states)
+ key_proj = self.key(hidden_states)
+ value_proj = self.value(hidden_states)
+
+ # transpose
+ query_states = self.transpose_for_scores(query_proj)
+ key_states = self.transpose_for_scores(key_proj)
+ value_states = self.transpose_for_scores(value_proj)
+
+ # get scores
+ scale = 1 / math.sqrt(math.sqrt(self.channels / self.num_heads))
+ attention_scores = torch.matmul(query_states * scale, key_states.transpose(-1, -2) * scale) # TODO: use baddmm
+ attention_probs = torch.softmax(attention_scores.float(), dim=-1).type(attention_scores.dtype)
+
+ # compute attention output
+ hidden_states = torch.matmul(attention_probs, value_states)
+
+ hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous()
+ new_hidden_states_shape = hidden_states.size()[:-2] + (self.channels,)
+ hidden_states = hidden_states.view(new_hidden_states_shape)
+
+ # compute next hidden_states
+ hidden_states = self.proj_attn(hidden_states)
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch, channel, height, width)
+
+ # res connect and rescale
+ hidden_states = (hidden_states + residual) / self.rescale_output_factor
+ return hidden_states
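+
+
+# Minimal sketch (an assumption, not part of the original file): AttentionBlock
+# attends over the flattened spatial positions and returns a tensor of the same
+# (batch, channels, height, width) shape.
+def _demo_attention_block():
+    block = AttentionBlock(channels=32, num_head_channels=8, norm_num_groups=8)
+    assert block(torch.randn(1, 32, 4, 4)).shape == (1, 32, 4, 4)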
+
+
+class BasicTransformerBlock(nn.Module):
+ r"""
+ A basic Transformer block.
+
+ Parameters:
+ dim (`int`): The number of channels in the input and output.
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
+ attention_head_dim (`int`): The number of channels in each head.
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+ cross_attention_dim (`int`, *optional*): The size of the context vector for cross attention.
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm (`int`, *optional*):
+            The number of diffusion steps used during training. See `Transformer2DModel`.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Configure if the attentions should contain a bias parameter.
+ """
+
+ def __init__(
+ self,
+ dim: int,
+ num_attention_heads: int,
+ attention_head_dim: int,
+ dropout=0.0,
+ cross_attention_dim: Optional[int] = None,
+ activation_fn: str = "geglu",
+ num_embeds_ada_norm: Optional[int] = None,
+ attention_bias: bool = False,
+ ):
+ super().__init__()
+ self.attn1 = CrossAttention(
+ query_dim=dim,
+ heads=num_attention_heads,
+ dim_head=attention_head_dim,
+ dropout=dropout,
+ bias=attention_bias,
+ ) # is a self-attention
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
+ self.attn2 = CrossAttention(
+ query_dim=dim,
+ cross_attention_dim=cross_attention_dim,
+ heads=num_attention_heads,
+ dim_head=attention_head_dim,
+ dropout=dropout,
+ bias=attention_bias,
+ ) # is self-attn if context is none
+
+ # layer norms
+ self.use_ada_layer_norm = num_embeds_ada_norm is not None
+ if self.use_ada_layer_norm:
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
+ self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
+ else:
+ self.norm1 = nn.LayerNorm(dim)
+ self.norm2 = nn.LayerNorm(dim)
+ self.norm3 = nn.LayerNorm(dim)
+
+ def _set_attention_slice(self, slice_size):
+ self.attn1._slice_size = slice_size
+ self.attn2._slice_size = slice_size
+
+ def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+ if not is_xformers_available():
+ raise ModuleNotFoundError(
+ "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
+ " xformers",
+ name="xformers",
+ )
+ elif not torch.cuda.is_available():
+ raise ValueError(
+ "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is only"
+ " available for GPU "
+ )
+ else:
+ try:
+ # Make sure we can run the memory efficient attention
+ _ = xformers.ops.memory_efficient_attention(
+ torch.randn((1, 2, 40), device="cuda"),
+ torch.randn((1, 2, 40), device="cuda"),
+ torch.randn((1, 2, 40), device="cuda"),
+ )
+ except Exception as e:
+ raise e
+ self.attn1._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
+ self.attn2._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
+
+ def forward(self, hidden_states, context=None, timestep=None):
+ # 1. Self-Attention
+ norm_hidden_states = (
+ self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(hidden_states)
+ )
+ hidden_states = self.attn1(norm_hidden_states) + hidden_states
+
+ # 2. Cross-Attention
+ norm_hidden_states = (
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
+ )
+ hidden_states = self.attn2(norm_hidden_states, context=context) + hidden_states
+
+ # 3. Feed-forward
+ hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
+
+ return hidden_states
+
+
+class CrossAttention(nn.Module):
+ r"""
+ A cross attention layer.
+
+ Parameters:
+ query_dim (`int`): The number of channels in the query.
+ cross_attention_dim (`int`, *optional*):
+ The number of channels in the context. If not given, defaults to `query_dim`.
+ heads (`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention.
+ dim_head (`int`, *optional*, defaults to 64): The number of channels in each head.
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+ bias (`bool`, *optional*, defaults to False):
+ Set to `True` for the query, key, and value linear layers to contain a bias parameter.
+ """
+
+ def __init__(
+ self,
+ query_dim: int,
+ cross_attention_dim: Optional[int] = None,
+ heads: int = 8,
+ dim_head: int = 64,
+ dropout: float = 0.0,
+ bias=False,
+ ):
+ super().__init__()
+ inner_dim = dim_head * heads
+ cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
+
+ self.scale = dim_head**-0.5
+ self.heads = heads
+ # for slice_size > 0 the attention score computation
+ # is split across the batch axis to save memory
+ # You can set slice_size with `set_attention_slice`
+ self._slice_size = None
+ self._use_memory_efficient_attention_xformers = False
+
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
+ self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
+ self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
+
+ self.to_out = nn.ModuleList([])
+ self.to_out.append(nn.Linear(inner_dim, query_dim))
+ self.to_out.append(nn.Dropout(dropout))
+
+ def reshape_heads_to_batch_dim(self, tensor):
+ batch_size, seq_len, dim = tensor.shape
+ head_size = self.heads
+ tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
+ tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size * head_size, seq_len, dim // head_size)
+ return tensor
+
+ def reshape_batch_dim_to_heads(self, tensor):
+ batch_size, seq_len, dim = tensor.shape
+ head_size = self.heads
+ tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
+ tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
+ return tensor
+
+ def forward(self, hidden_states, context=None, mask=None):
+ batch_size, sequence_length, _ = hidden_states.shape
+
+ query = self.to_q(hidden_states)
+ context = context if context is not None else hidden_states
+ key = self.to_k(context)
+ value = self.to_v(context)
+
+ dim = query.shape[-1]
+
+ query = self.reshape_heads_to_batch_dim(query)
+ key = self.reshape_heads_to_batch_dim(key)
+ value = self.reshape_heads_to_batch_dim(value)
+
+ # TODO(PVP) - mask is currently never used. Remember to re-implement when used
+
+ # attention, what we cannot get enough of
+ if self._use_memory_efficient_attention_xformers:
+ hidden_states = self._memory_efficient_attention_xformers(query, key, value)
+ # Some versions of xformers return output in fp32, cast it back to the dtype of the input
+ hidden_states = hidden_states.to(query.dtype)
+ else:
+ if self._slice_size is None or query.shape[0] // self._slice_size == 1:
+ hidden_states = self._attention(query, key, value)
+ else:
+ hidden_states = self._sliced_attention(query, key, value, sequence_length, dim)
+
+ # linear proj
+ hidden_states = self.to_out[0](hidden_states)
+ # dropout
+ hidden_states = self.to_out[1](hidden_states)
+ return hidden_states
+
+ def _attention(self, query, key, value):
+ # TODO: use baddbmm for better performance
+ if query.device.type == "mps":
+ # Better performance on mps (~20-25%)
+ attention_scores = torch.einsum("b i d, b j d -> b i j", query, key) * self.scale
+ else:
+ attention_scores = torch.matmul(query, key.transpose(-1, -2)) * self.scale
+ attention_probs = attention_scores.softmax(dim=-1)
+ # compute attention output
+
+ if query.device.type == "mps":
+ hidden_states = torch.einsum("b i j, b j d -> b i d", attention_probs, value)
+ else:
+ hidden_states = torch.matmul(attention_probs, value)
+
+ # reshape hidden_states
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
+ return hidden_states
+
+ def _sliced_attention(self, query, key, value, sequence_length, dim):
+ batch_size_attention = query.shape[0]
+ hidden_states = torch.zeros(
+ (batch_size_attention, sequence_length, dim // self.heads), device=query.device, dtype=query.dtype
+ )
+ slice_size = self._slice_size if self._slice_size is not None else hidden_states.shape[0]
+ for i in range(hidden_states.shape[0] // slice_size):
+ start_idx = i * slice_size
+ end_idx = (i + 1) * slice_size
+ if query.device.type == "mps":
+ # Better performance on mps (~20-25%)
+ attn_slice = (
+ torch.einsum("b i d, b j d -> b i j", query[start_idx:end_idx], key[start_idx:end_idx])
+ * self.scale
+ )
+ else:
+ attn_slice = (
+ torch.matmul(query[start_idx:end_idx], key[start_idx:end_idx].transpose(1, 2)) * self.scale
+ ) # TODO: use baddbmm for better performance
+ attn_slice = attn_slice.softmax(dim=-1)
+ if query.device.type == "mps":
+ attn_slice = torch.einsum("b i j, b j d -> b i d", attn_slice, value[start_idx:end_idx])
+ else:
+ attn_slice = torch.matmul(attn_slice, value[start_idx:end_idx])
+
+ hidden_states[start_idx:end_idx] = attn_slice
+
+ # reshape hidden_states
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
+ return hidden_states
+
+ def _memory_efficient_attention_xformers(self, query, key, value):
+ hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=None)
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
+ return hidden_states
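+
+
+# Minimal sketch (an assumption, not part of the original file): with a context
+# of a different sequence length and width, the output keeps the query's
+# sequence length and `query_dim` channels.
+def _demo_cross_attention():
+    attn = CrossAttention(query_dim=32, cross_attention_dim=16, heads=4, dim_head=8)
+    hidden_states = torch.randn(2, 10, 32)  # queries
+    context = torch.randn(2, 7, 16)  # keys / values
+    assert attn(hidden_states, context=context).shape == (2, 10, 32)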
+
+
+class FeedForward(nn.Module):
+ r"""
+ A feed-forward layer.
+
+ Parameters:
+ dim (`int`): The number of channels in the input.
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+ """
+
+ def __init__(
+ self,
+ dim: int,
+ dim_out: Optional[int] = None,
+ mult: int = 4,
+ dropout: float = 0.0,
+ activation_fn: str = "geglu",
+ ):
+ super().__init__()
+ inner_dim = int(dim * mult)
+ dim_out = dim_out if dim_out is not None else dim
+
+        if activation_fn == "geglu":
+            geglu = GEGLU(dim, inner_dim)
+        elif activation_fn == "geglu-approximate":
+            geglu = ApproximateGELU(dim, inner_dim)
+        else:
+            raise ValueError(f"Unsupported activation function: {activation_fn}")
+
+ self.net = nn.ModuleList([])
+ # project in
+ self.net.append(geglu)
+ # project dropout
+ self.net.append(nn.Dropout(dropout))
+ # project out
+ self.net.append(nn.Linear(inner_dim, dim_out))
+
+ def forward(self, hidden_states):
+ for module in self.net:
+ hidden_states = module(hidden_states)
+ return hidden_states
+
+
+# feedforward
+class GEGLU(nn.Module):
+ r"""
+ A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
+
+ Parameters:
+ dim_in (`int`): The number of channels in the input.
+ dim_out (`int`): The number of channels in the output.
+ """
+
+ def __init__(self, dim_in: int, dim_out: int):
+ super().__init__()
+ self.proj = nn.Linear(dim_in, dim_out * 2)
+
+ def gelu(self, gate):
+ if gate.device.type != "mps":
+ return F.gelu(gate)
+ # mps: gelu is not implemented for float16
+ return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
+
+ def forward(self, hidden_states):
+ hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
+ return hidden_states * self.gelu(gate)
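+
+
+# Minimal sketch (an assumption, not part of the original file): GEGLU projects
+# to twice `dim_out`, then gates one half with the GELU of the other.
+def _demo_geglu():
+    layer = GEGLU(dim_in=4, dim_out=8)
+    assert layer(torch.randn(1, 4)).shape == (1, 8)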
+
+
+class ApproximateGELU(nn.Module):
+ """
+ The approximate form of Gaussian Error Linear Unit (GELU)
+
+ For more details, see section 2: https://arxiv.org/abs/1606.08415
+ """
+
+ def __init__(self, dim_in: int, dim_out: int):
+ super().__init__()
+ self.proj = nn.Linear(dim_in, dim_out)
+
+ def forward(self, x):
+ x = self.proj(x)
+ return x * torch.sigmoid(1.702 * x)
+
+
+class AdaLayerNorm(nn.Module):
+ """
+ Norm layer modified to incorporate timestep embeddings.
+ """
+
+ def __init__(self, embedding_dim, num_embeddings):
+ super().__init__()
+ self.emb = nn.Embedding(num_embeddings, embedding_dim)
+ self.silu = nn.SiLU()
+ self.linear = nn.Linear(embedding_dim, embedding_dim * 2)
+ self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False)
+
+ def forward(self, x, timestep):
+ emb = self.linear(self.silu(self.emb(timestep)))
+ scale, shift = torch.chunk(emb, 2)
+ x = self.norm(x) * (1 + scale) + shift
+ return x
diff --git a/src/model/TextGen/diffusers/models/attention_flax.py b/src/model/TextGen/diffusers/models/attention_flax.py
new file mode 100644
index 0000000..1b86094
--- /dev/null
+++ b/src/model/TextGen/diffusers/models/attention_flax.py
@@ -0,0 +1,269 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import flax.linen as nn
+import jax.numpy as jnp
+
+
+class FlaxAttentionBlock(nn.Module):
+ r"""
+ A Flax multi-head attention module as described in: https://arxiv.org/abs/1706.03762
+
+ Parameters:
+ query_dim (:obj:`int`):
+ Input hidden states dimension
+ heads (:obj:`int`, *optional*, defaults to 8):
+ Number of heads
+ dim_head (:obj:`int`, *optional*, defaults to 64):
+ Hidden states dimension inside each head
+ dropout (:obj:`float`, *optional*, defaults to 0.0):
+ Dropout rate
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+
+ """
+ query_dim: int
+ heads: int = 8
+ dim_head: int = 64
+ dropout: float = 0.0
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ inner_dim = self.dim_head * self.heads
+ self.scale = self.dim_head**-0.5
+
+ # Weights were exported with old names {to_q, to_k, to_v, to_out}
+ self.query = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_q")
+ self.key = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_k")
+ self.value = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_v")
+
+ self.proj_attn = nn.Dense(self.query_dim, dtype=self.dtype, name="to_out_0")
+
+ def reshape_heads_to_batch_dim(self, tensor):
+ batch_size, seq_len, dim = tensor.shape
+ head_size = self.heads
+ tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
+ tensor = jnp.transpose(tensor, (0, 2, 1, 3))
+ tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size)
+ return tensor
+
+ def reshape_batch_dim_to_heads(self, tensor):
+ batch_size, seq_len, dim = tensor.shape
+ head_size = self.heads
+ tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
+ tensor = jnp.transpose(tensor, (0, 2, 1, 3))
+ tensor = tensor.reshape(batch_size // head_size, seq_len, dim * head_size)
+ return tensor
+
+ def __call__(self, hidden_states, context=None, deterministic=True):
+ context = hidden_states if context is None else context
+
+ query_proj = self.query(hidden_states)
+ key_proj = self.key(context)
+ value_proj = self.value(context)
+
+ query_states = self.reshape_heads_to_batch_dim(query_proj)
+ key_states = self.reshape_heads_to_batch_dim(key_proj)
+ value_states = self.reshape_heads_to_batch_dim(value_proj)
+
+ # compute attentions
+ attention_scores = jnp.einsum("b i d, b j d->b i j", query_states, key_states)
+ attention_scores = attention_scores * self.scale
+ attention_probs = nn.softmax(attention_scores, axis=2)
+
+ # attend to values
+ hidden_states = jnp.einsum("b i j, b j d -> b i d", attention_probs, value_states)
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
+ hidden_states = self.proj_attn(hidden_states)
+ return hidden_states
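+
+
+# Minimal sketch (an assumption, not part of the original file): applying the
+# block through Flax's functional init/apply API.
+def _demo_flax_attention_block():
+    import jax
+
+    block = FlaxAttentionBlock(query_dim=16, heads=2, dim_head=8)
+    hidden_states = jnp.ones((1, 4, 16))  # (batch, seq_len, query_dim)
+    params = block.init(jax.random.PRNGKey(0), hidden_states)
+    assert block.apply(params, hidden_states).shape == (1, 4, 16)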
+
+
+class FlaxBasicTransformerBlock(nn.Module):
+ r"""
+ A Flax transformer block layer with `GLU` (Gated Linear Unit) activation function as described in:
+ https://arxiv.org/abs/1706.03762
+
+
+ Parameters:
+ dim (:obj:`int`):
+ Inner hidden states dimension
+ n_heads (:obj:`int`):
+ Number of heads
+ d_head (:obj:`int`):
+ Hidden states dimension inside each head
+ dropout (:obj:`float`, *optional*, defaults to 0.0):
+ Dropout rate
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+ """
+ dim: int
+ n_heads: int
+ d_head: int
+ dropout: float = 0.0
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ # self attention
+ self.attn1 = FlaxAttentionBlock(self.dim, self.n_heads, self.d_head, self.dropout, dtype=self.dtype)
+ # cross attention
+ self.attn2 = FlaxAttentionBlock(self.dim, self.n_heads, self.d_head, self.dropout, dtype=self.dtype)
+ self.ff = FlaxGluFeedForward(dim=self.dim, dropout=self.dropout, dtype=self.dtype)
+ self.norm1 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
+ self.norm2 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
+ self.norm3 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
+
+ def __call__(self, hidden_states, context, deterministic=True):
+ # self attention
+ residual = hidden_states
+ hidden_states = self.attn1(self.norm1(hidden_states), deterministic=deterministic)
+ hidden_states = hidden_states + residual
+
+ # cross attention
+ residual = hidden_states
+ hidden_states = self.attn2(self.norm2(hidden_states), context, deterministic=deterministic)
+ hidden_states = hidden_states + residual
+
+ # feed forward
+ residual = hidden_states
+ hidden_states = self.ff(self.norm3(hidden_states), deterministic=deterministic)
+ hidden_states = hidden_states + residual
+
+ return hidden_states
+
+
+class FlaxTransformer2DModel(nn.Module):
+ r"""
+ A Spatial Transformer layer with Gated Linear Unit (GLU) activation function as described in:
+ https://arxiv.org/pdf/1506.02025.pdf
+
+
+ Parameters:
+ in_channels (:obj:`int`):
+ Input number of channels
+ n_heads (:obj:`int`):
+ Number of heads
+ d_head (:obj:`int`):
+ Hidden states dimension inside each head
+ depth (:obj:`int`, *optional*, defaults to 1):
+ Number of transformers block
+ dropout (:obj:`float`, *optional*, defaults to 0.0):
+ Dropout rate
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+ """
+ in_channels: int
+ n_heads: int
+ d_head: int
+ depth: int = 1
+ dropout: float = 0.0
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ self.norm = nn.GroupNorm(num_groups=32, epsilon=1e-5)
+
+ inner_dim = self.n_heads * self.d_head
+ self.proj_in = nn.Conv(
+ inner_dim,
+ kernel_size=(1, 1),
+ strides=(1, 1),
+ padding="VALID",
+ dtype=self.dtype,
+ )
+
+ self.transformer_blocks = [
+ FlaxBasicTransformerBlock(inner_dim, self.n_heads, self.d_head, dropout=self.dropout, dtype=self.dtype)
+ for _ in range(self.depth)
+ ]
+
+ self.proj_out = nn.Conv(
+ inner_dim,
+ kernel_size=(1, 1),
+ strides=(1, 1),
+ padding="VALID",
+ dtype=self.dtype,
+ )
+
+ def __call__(self, hidden_states, context, deterministic=True):
+ batch, height, width, channels = hidden_states.shape
+ residual = hidden_states
+ hidden_states = self.norm(hidden_states)
+ hidden_states = self.proj_in(hidden_states)
+
+ hidden_states = hidden_states.reshape(batch, height * width, channels)
+
+ for transformer_block in self.transformer_blocks:
+ hidden_states = transformer_block(hidden_states, context, deterministic=deterministic)
+
+ hidden_states = hidden_states.reshape(batch, height, width, channels)
+
+ hidden_states = self.proj_out(hidden_states)
+ hidden_states = hidden_states + residual
+
+ return hidden_states
+
+
+class FlaxGluFeedForward(nn.Module):
+ r"""
+ Flax module that encapsulates two Linear layers separated by a gated linear unit activation from:
+ https://arxiv.org/abs/2002.05202
+
+ Parameters:
+ dim (:obj:`int`):
+ Inner hidden states dimension
+ dropout (:obj:`float`, *optional*, defaults to 0.0):
+ Dropout rate
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+ """
+ dim: int
+ dropout: float = 0.0
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ # The second linear layer needs to be called
+ # net_2 for now to match the index of the Sequential layer
+ self.net_0 = FlaxGEGLU(self.dim, self.dropout, self.dtype)
+ self.net_2 = nn.Dense(self.dim, dtype=self.dtype)
+
+ def __call__(self, hidden_states, deterministic=True):
+ hidden_states = self.net_0(hidden_states)
+ hidden_states = self.net_2(hidden_states)
+ return hidden_states
+
+
+class FlaxGEGLU(nn.Module):
+ r"""
+ Flax implementation of a Linear layer followed by the variant of the gated linear unit activation function from
+ https://arxiv.org/abs/2002.05202.
+
+ Parameters:
+ dim (:obj:`int`):
+ Input hidden states dimension
+ dropout (:obj:`float`, *optional*, defaults to 0.0):
+ Dropout rate
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+ """
+ dim: int
+ dropout: float = 0.0
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ inner_dim = self.dim * 4
+ self.proj = nn.Dense(inner_dim * 2, dtype=self.dtype)
+
+ def __call__(self, hidden_states, deterministic=True):
+ hidden_states = self.proj(hidden_states)
+ hidden_linear, hidden_gelu = jnp.split(hidden_states, 2, axis=2)
+ return hidden_linear * nn.gelu(hidden_gelu)
diff --git a/src/model/TextGen/diffusers/models/embeddings.py b/src/model/TextGen/diffusers/models/embeddings.py
new file mode 100644
index 0000000..2af0984
--- /dev/null
+++ b/src/model/TextGen/diffusers/models/embeddings.py
@@ -0,0 +1,197 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+
+import numpy as np
+import torch
+from torch import nn
+
+
+def get_timestep_embedding(
+ timesteps: torch.Tensor,
+ embedding_dim: int,
+ flip_sin_to_cos: bool = False,
+ downscale_freq_shift: float = 1,
+ scale: float = 1,
+ max_period: int = 10000,
+):
+ """
+ This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
+
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
+ These may be fractional.
+    :param embedding_dim: the dimension of the output.
+    :param max_period: controls the minimum frequency of the embeddings.
+    :return: an [N x dim] Tensor of positional embeddings.
+ """
+ assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
+
+ half_dim = embedding_dim // 2
+ exponent = -math.log(max_period) * torch.arange(
+ start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
+ )
+ exponent = exponent / (half_dim - downscale_freq_shift)
+
+ emb = torch.exp(exponent)
+ emb = timesteps[:, None].float() * emb[None, :]
+
+ # scale embeddings
+ emb = scale * emb
+
+ # concat sine and cosine embeddings
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
+
+ # flip sine and cosine embeddings
+ if flip_sin_to_cos:
+ emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
+
+ # zero pad
+ if embedding_dim % 2 == 1:
+ emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
+ return emb
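+
+
+# Minimal sketch (an assumption, not part of the original file): two timesteps
+# embedded into 8 channels; by default the first half is sine and the second
+# half cosine.
+def _demo_timestep_embedding():
+    timesteps = torch.tensor([0, 999])
+    assert get_timestep_embedding(timesteps, embedding_dim=8).shape == (2, 8)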
+
+
+class TimestepEmbedding(nn.Module):
+ def __init__(self, channel: int, time_embed_dim: int, act_fn: str = "silu"):
+ super().__init__()
+
+ self.linear_1 = nn.Linear(channel, time_embed_dim)
+ self.act = None
+ if act_fn == "silu":
+ self.act = nn.SiLU()
+ self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim)
+
+ def forward(self, sample):
+ sample = self.linear_1(sample)
+
+ if self.act is not None:
+ sample = self.act(sample)
+
+ sample = self.linear_2(sample)
+ return sample
+
+
+class Timesteps(nn.Module):
+ def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float):
+ super().__init__()
+ self.num_channels = num_channels
+ self.flip_sin_to_cos = flip_sin_to_cos
+ self.downscale_freq_shift = downscale_freq_shift
+
+ def forward(self, timesteps):
+ t_emb = get_timestep_embedding(
+ timesteps,
+ self.num_channels,
+ flip_sin_to_cos=self.flip_sin_to_cos,
+ downscale_freq_shift=self.downscale_freq_shift,
+ )
+ return t_emb
+
+
+class GaussianFourierProjection(nn.Module):
+ """Gaussian Fourier embeddings for noise levels."""
+
+ def __init__(
+ self, embedding_size: int = 256, scale: float = 1.0, set_W_to_weight=True, log=True, flip_sin_to_cos=False
+ ):
+ super().__init__()
+        self.weight = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
+ self.log = log
+ self.flip_sin_to_cos = flip_sin_to_cos
+
+ if set_W_to_weight:
+ # to delete later
+            self.W = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
+
+ self.weight = self.W
+
+ def forward(self, x):
+ if self.log:
+ x = torch.log(x)
+
+ x_proj = x[:, None] * self.weight[None, :] * 2 * np.pi
+
+ if self.flip_sin_to_cos:
+ out = torch.cat([torch.cos(x_proj), torch.sin(x_proj)], dim=-1)
+ else:
+ out = torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1)
+ return out
+
+
+class ImagePositionalEmbeddings(nn.Module):
+ """
+ Converts latent image classes into vector embeddings. Sums the vector embeddings with positional embeddings for the
+ height and width of the latent space.
+
+ For more details, see figure 10 of the dall-e paper: https://arxiv.org/abs/2102.12092
+
+ For VQ-diffusion:
+
+ Output vector embeddings are used as input for the transformer.
+
+ Note that the vector embeddings for the transformer are different than the vector embeddings from the VQVAE.
+
+ Args:
+ num_embed (`int`):
+ Number of embeddings for the latent pixels embeddings.
+ height (`int`):
+ Height of the latent image i.e. the number of height embeddings.
+ width (`int`):
+ Width of the latent image i.e. the number of width embeddings.
+ embed_dim (`int`):
+ Dimension of the produced vector embeddings. Used for the latent pixel, height, and width embeddings.
+ """
+
+ def __init__(
+ self,
+ num_embed: int,
+ height: int,
+ width: int,
+ embed_dim: int,
+ ):
+ super().__init__()
+
+ self.height = height
+ self.width = width
+ self.num_embed = num_embed
+ self.embed_dim = embed_dim
+
+ self.emb = nn.Embedding(self.num_embed, embed_dim)
+ self.height_emb = nn.Embedding(self.height, embed_dim)
+ self.width_emb = nn.Embedding(self.width, embed_dim)
+
+ def forward(self, index):
+ emb = self.emb(index)
+
+        height_emb = self.height_emb(torch.arange(self.height, device=index.device).view(1, self.height))
+
+ # 1 x H x D -> 1 x H x 1 x D
+ height_emb = height_emb.unsqueeze(2)
+
+        width_emb = self.width_emb(torch.arange(self.width, device=index.device).view(1, self.width))
+
+ # 1 x W x D -> 1 x 1 x W x D
+ width_emb = width_emb.unsqueeze(1)
+
+ pos_emb = height_emb + width_emb
+
+        # 1 x H x W x D -> 1 x L x D
+ pos_emb = pos_emb.view(1, self.height * self.width, -1)
+
+ emb = emb + pos_emb[:, : emb.shape[1], :]
+
+ return emb
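+
+
+# Minimal sketch (an assumption, not part of the original file): a flattened
+# 4x4 grid of latent-pixel classes is mapped to one vector per latent pixel,
+# with height and width positional embeddings added.
+def _demo_image_positional_embeddings():
+    embed = ImagePositionalEmbeddings(num_embed=10, height=4, width=4, embed_dim=8)
+    index = torch.randint(0, 10, (1, 16))  # (batch, height * width)
+    assert embed(index).shape == (1, 16, 8)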
diff --git a/src/model/TextGen/diffusers/models/embeddings_flax.py b/src/model/TextGen/diffusers/models/embeddings_flax.py
new file mode 100644
index 0000000..bf7d54b
--- /dev/null
+++ b/src/model/TextGen/diffusers/models/embeddings_flax.py
@@ -0,0 +1,93 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+
+import flax.linen as nn
+import jax.numpy as jnp
+
+
+def get_sinusoidal_embeddings(
+ timesteps: jnp.ndarray,
+ embedding_dim: int,
+ freq_shift: float = 1,
+ min_timescale: float = 1,
+ max_timescale: float = 1.0e4,
+ flip_sin_to_cos: bool = False,
+ scale: float = 1.0,
+) -> jnp.ndarray:
+ """Returns the positional encoding (same as Tensor2Tensor).
+ Args:
+ timesteps: a 1-D Tensor of N indices, one per batch element.
+ These may be fractional.
+ embedding_dim: The number of output channels.
+ min_timescale: The smallest time unit (should probably be 0.0).
+ max_timescale: The largest time unit.
+ Returns:
+ a Tensor of timing signals [N, num_channels]
+ """
+ assert timesteps.ndim == 1, "Timesteps should be a 1d-array"
+ assert embedding_dim % 2 == 0, f"Embedding dimension {embedding_dim} should be even"
+ num_timescales = float(embedding_dim // 2)
+ log_timescale_increment = math.log(max_timescale / min_timescale) / (num_timescales - freq_shift)
+ inv_timescales = min_timescale * jnp.exp(jnp.arange(num_timescales, dtype=jnp.float32) * -log_timescale_increment)
+ emb = jnp.expand_dims(timesteps, 1) * jnp.expand_dims(inv_timescales, 0)
+
+ # scale embeddings
+ scaled_time = scale * emb
+
+ if flip_sin_to_cos:
+ signal = jnp.concatenate([jnp.cos(scaled_time), jnp.sin(scaled_time)], axis=1)
+ else:
+ signal = jnp.concatenate([jnp.sin(scaled_time), jnp.cos(scaled_time)], axis=1)
+ signal = jnp.reshape(signal, [jnp.shape(timesteps)[0], embedding_dim])
+ return signal
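+
+
+# Minimal sketch (an assumption, not part of the original file): the JAX
+# counterpart of the PyTorch `get_timestep_embedding` helper.
+def _demo_sinusoidal_embeddings():
+    timesteps = jnp.array([0.0, 999.0])
+    assert get_sinusoidal_embeddings(timesteps, embedding_dim=8).shape == (2, 8)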
+
+
+class FlaxTimestepEmbedding(nn.Module):
+ r"""
+ Time step Embedding Module. Learns embeddings for input time steps.
+
+ Args:
+ time_embed_dim (`int`, *optional*, defaults to `32`):
+ Time step embedding dimension
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+ """
+ time_embed_dim: int = 32
+ dtype: jnp.dtype = jnp.float32
+
+ @nn.compact
+ def __call__(self, temb):
+ temb = nn.Dense(self.time_embed_dim, dtype=self.dtype, name="linear_1")(temb)
+ temb = nn.silu(temb)
+ temb = nn.Dense(self.time_embed_dim, dtype=self.dtype, name="linear_2")(temb)
+ return temb
+
+
+class FlaxTimesteps(nn.Module):
+ r"""
+ Wrapper Module for sinusoidal Time step Embeddings as described in https://arxiv.org/abs/2006.11239
+
+ Args:
+ dim (`int`, *optional*, defaults to `32`):
+ Time step embedding dimension
+ """
+ dim: int = 32
+ freq_shift: float = 1
+
+ @nn.compact
+ def __call__(self, timesteps):
+ return get_sinusoidal_embeddings(
+ timesteps, embedding_dim=self.dim, freq_shift=self.freq_shift, flip_sin_to_cos=True
+ )
diff --git a/src/model/TextGen/diffusers/models/resnet.py b/src/model/TextGen/diffusers/models/resnet.py
new file mode 100644
index 0000000..7bb5416
--- /dev/null
+++ b/src/model/TextGen/diffusers/models/resnet.py
@@ -0,0 +1,531 @@
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Upsample2D(nn.Module):
+ """
+ An upsampling layer with an optional convolution.
+
+ Parameters:
+ channels: channels in the inputs and outputs.
+ use_conv: a bool determining if a convolution is applied.
+ dims: determines if the signal is 1D, 2D, or 3D. If 3D, then upsampling occurs in the inner-two dimensions.
+ """
+
+ def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"):
+ super().__init__()
+ self.channels = channels
+ self.out_channels = out_channels or channels
+ self.use_conv = use_conv
+ self.use_conv_transpose = use_conv_transpose
+ self.name = name
+
+ conv = None
+ if use_conv_transpose:
+ conv = nn.ConvTranspose2d(channels, self.out_channels, 4, 2, 1)
+ elif use_conv:
+ conv = nn.Conv2d(self.channels, self.out_channels, 3, padding=1)
+
+ # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
+ if name == "conv":
+ self.conv = conv
+ else:
+ self.Conv2d_0 = conv
+
+ def forward(self, hidden_states, output_size=None):
+ assert hidden_states.shape[1] == self.channels
+
+ if self.use_conv_transpose:
+ return self.conv(hidden_states)
+
+ # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16
+ # TODO(Suraj): Remove this cast once the issue is fixed in PyTorch
+ # https://github.com/pytorch/pytorch/issues/86679
+ dtype = hidden_states.dtype
+ if dtype == torch.bfloat16:
+ hidden_states = hidden_states.to(torch.float32)
+
+ # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+ if hidden_states.shape[0] >= 64:
+ hidden_states = hidden_states.contiguous()
+
+ # if `output_size` is passed we force the interpolation output
+ # size and do not make use of `scale_factor=2`
+ if output_size is None:
+ hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
+ else:
+ hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest")
+
+ # If the input is bfloat16, we cast back to bfloat16
+ if dtype == torch.bfloat16:
+ hidden_states = hidden_states.to(dtype)
+
+ # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
+ if self.use_conv:
+ if self.name == "conv":
+ hidden_states = self.conv(hidden_states)
+ else:
+ hidden_states = self.Conv2d_0(hidden_states)
+
+ return hidden_states
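+
+
+# Minimal sketch (an assumption, not part of the original file):
+# nearest-neighbor interpolation doubles the spatial resolution, and with
+# `use_conv=True` a 3x3 convolution refines the result at the same channel count.
+def _demo_upsample_2d():
+    upsample = Upsample2D(channels=8, use_conv=True)
+    assert upsample(torch.randn(1, 8, 16, 16)).shape == (1, 8, 32, 32)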
+
+
+class Downsample2D(nn.Module):
+ """
+ A downsampling layer with an optional convolution.
+
+ Parameters:
+ channels: channels in the inputs and outputs.
+ use_conv: a bool determining if a convolution is applied.
+ out_channels: number of output channels; defaults to `channels`.
+ padding: padding applied to the input before the strided convolution.
+ """
+
+ def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
+ super().__init__()
+ self.channels = channels
+ self.out_channels = out_channels or channels
+ self.use_conv = use_conv
+ self.padding = padding
+ stride = 2
+ self.name = name
+
+ if use_conv:
+ conv = nn.Conv2d(self.channels, self.out_channels, 3, stride=stride, padding=padding)
+ else:
+ assert self.channels == self.out_channels
+ conv = nn.AvgPool2d(kernel_size=stride, stride=stride)
+
+ # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
+ if name == "conv":
+ self.Conv2d_0 = conv
+ self.conv = conv
+ elif name == "Conv2d_0":
+ self.conv = conv
+ else:
+ self.conv = conv
+
+ def forward(self, hidden_states):
+ assert hidden_states.shape[1] == self.channels
+ if self.use_conv and self.padding == 0:
+ pad = (0, 1, 0, 1)
+ hidden_states = F.pad(hidden_states, pad, mode="constant", value=0)
+
+ assert hidden_states.shape[1] == self.channels
+ hidden_states = self.conv(hidden_states)
+
+ return hidden_states
+
+
+class FirUpsample2D(nn.Module):
+ def __init__(self, channels=None, out_channels=None, use_conv=False, fir_kernel=(1, 3, 3, 1)):
+ super().__init__()
+ out_channels = out_channels if out_channels else channels
+ if use_conv:
+ self.Conv2d_0 = nn.Conv2d(channels, out_channels, kernel_size=3, stride=1, padding=1)
+ self.use_conv = use_conv
+ self.fir_kernel = fir_kernel
+ self.out_channels = out_channels
+
+ def _upsample_2d(self, hidden_states, weight=None, kernel=None, factor=2, gain=1):
+ """Fused `upsample_2d()` followed by `Conv2d()`.
+
+ Padding is performed only once at the beginning, not between the operations. The fused op is considerably more
+ efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of
+ arbitrary order.
+
+ Args:
+ hidden_states: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
+ weight: Weight tensor of the shape `[filterH, filterW, inChannels,
+ outChannels]`. Grouped convolution can be performed by `inChannels = x.shape[0] // numGroups`.
+ kernel: FIR filter of the shape `[firH, firW]` or `[firN]`
+ (separable). The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling.
+ factor: Integer upsampling factor (default: 2).
+ gain: Scaling factor for signal magnitude (default: 1.0).
+
+ Returns:
+ output: Tensor of the shape `[N, C, H * factor, W * factor]` or `[N, H * factor, W * factor, C]`, and same
+ datatype as `hidden_states`.
+ """
+
+ assert isinstance(factor, int) and factor >= 1
+
+ # Setup filter kernel.
+ if kernel is None:
+ kernel = [1] * factor
+
+ # setup kernel
+ kernel = torch.tensor(kernel, dtype=torch.float32)
+ if kernel.ndim == 1:
+ kernel = torch.outer(kernel, kernel)
+ kernel /= torch.sum(kernel)
+
+ kernel = kernel * (gain * (factor**2))
+
+ if self.use_conv:
+ convH = weight.shape[2]
+ convW = weight.shape[3]
+ inC = weight.shape[1]
+
+ pad_value = (kernel.shape[0] - factor) - (convW - 1)
+
+ stride = (factor, factor)
+ # Determine data dimensions.
+ output_shape = (
+ (hidden_states.shape[2] - 1) * factor + convH,
+ (hidden_states.shape[3] - 1) * factor + convW,
+ )
+ output_padding = (
+ output_shape[0] - (hidden_states.shape[2] - 1) * stride[0] - convH,
+ output_shape[1] - (hidden_states.shape[3] - 1) * stride[1] - convW,
+ )
+ assert output_padding[0] >= 0 and output_padding[1] >= 0
+ num_groups = hidden_states.shape[1] // inC
+
+ # Transpose weights.
+ weight = torch.reshape(weight, (num_groups, -1, inC, convH, convW))
+ weight = torch.flip(weight, dims=[3, 4]).permute(0, 2, 1, 3, 4)
+ weight = torch.reshape(weight, (num_groups * inC, -1, convH, convW))
+
+ inverse_conv = F.conv_transpose2d(
+ hidden_states, weight, stride=stride, output_padding=output_padding, padding=0
+ )
+
+ output = upfirdn2d_native(
+ inverse_conv,
+ kernel.to(device=inverse_conv.device),
+ pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2 + 1),
+ )
+ else:
+ pad_value = kernel.shape[0] - factor
+ output = upfirdn2d_native(
+ hidden_states,
+ kernel.to(device=hidden_states.device),
+ up=factor,
+ pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2),
+ )
+
+ return output
+
+ def forward(self, hidden_states):
+ if self.use_conv:
+ height = self._upsample_2d(hidden_states, self.Conv2d_0.weight, kernel=self.fir_kernel)
+ height = height + self.Conv2d_0.bias.reshape(1, -1, 1, 1)
+ else:
+ height = self._upsample_2d(hidden_states, kernel=self.fir_kernel, factor=2)
+
+ return height
+
+
+class FirDownsample2D(nn.Module):
+ def __init__(self, channels=None, out_channels=None, use_conv=False, fir_kernel=(1, 3, 3, 1)):
+ super().__init__()
+ out_channels = out_channels if out_channels else channels
+ if use_conv:
+ self.Conv2d_0 = nn.Conv2d(channels, out_channels, kernel_size=3, stride=1, padding=1)
+ self.fir_kernel = fir_kernel
+ self.use_conv = use_conv
+ self.out_channels = out_channels
+
+ def _downsample_2d(self, hidden_states, weight=None, kernel=None, factor=2, gain=1):
+ """Fused `Conv2d()` followed by `downsample_2d()`.
+ Padding is performed only once at the beginning, not between the operations. The fused op is considerably more
+ efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of
+ arbitrary order.
+
+ Args:
+ hidden_states: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
+ weight:
+ Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`. Grouped convolution can be
+ performed by `inChannels = x.shape[0] // numGroups`.
+ kernel: FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] *
+ factor`, which corresponds to average pooling.
+ factor: Integer downsampling factor (default: 2).
+ gain: Scaling factor for signal magnitude (default: 1.0).
+
+ Returns:
+ output: Tensor of the shape `[N, C, H // factor, W // factor]` or `[N, H // factor, W // factor, C]`, and
+ same datatype as `x`.
+ """
+
+ assert isinstance(factor, int) and factor >= 1
+ if kernel is None:
+ kernel = [1] * factor
+
+ # setup kernel
+ kernel = torch.tensor(kernel, dtype=torch.float32)
+ if kernel.ndim == 1:
+ kernel = torch.outer(kernel, kernel)
+ kernel /= torch.sum(kernel)
+
+ kernel = kernel * gain
+
+ if self.use_conv:
+ _, _, convH, convW = weight.shape
+ pad_value = (kernel.shape[0] - factor) + (convW - 1)
+ stride_value = [factor, factor]
+ upfirdn_input = upfirdn2d_native(
+ hidden_states,
+ kernel.to(device=hidden_states.device),
+ pad=((pad_value + 1) // 2, pad_value // 2),
+ )
+ output = F.conv2d(upfirdn_input, weight, stride=stride_value, padding=0)
+ else:
+ pad_value = kernel.shape[0] - factor
+ output = upfirdn2d_native(
+ hidden_states,
+ kernel.to(device=hidden_states.device),
+ down=factor,
+ pad=((pad_value + 1) // 2, pad_value // 2),
+ )
+
+ return output
+
+ def forward(self, hidden_states):
+ if self.use_conv:
+ downsample_input = self._downsample_2d(hidden_states, weight=self.Conv2d_0.weight, kernel=self.fir_kernel)
+ hidden_states = downsample_input + self.Conv2d_0.bias.reshape(1, -1, 1, 1)
+ else:
+ hidden_states = self._downsample_2d(hidden_states, kernel=self.fir_kernel, factor=2)
+
+ return hidden_states
+
+
+class ResnetBlock2D(nn.Module):
+ def __init__(
+ self,
+ *,
+ in_channels,
+ out_channels=None,
+ conv_shortcut=False,
+ dropout=0.0,
+ temb_channels=512,
+ groups=32,
+ groups_out=None,
+ pre_norm=True,
+ eps=1e-6,
+ non_linearity="swish",
+ time_embedding_norm="default",
+ kernel=None,
+ output_scale_factor=1.0,
+ use_in_shortcut=None,
+ up=False,
+ down=False,
+ ):
+ super().__init__()
+ self.pre_norm = True  # pre-norm is always used here; the `pre_norm` argument is accepted only for interface compatibility
+ self.in_channels = in_channels
+ out_channels = in_channels if out_channels is None else out_channels
+ self.out_channels = out_channels
+ self.use_conv_shortcut = conv_shortcut
+ self.time_embedding_norm = time_embedding_norm
+ self.up = up
+ self.down = down
+ self.output_scale_factor = output_scale_factor
+
+ if groups_out is None:
+ groups_out = groups
+
+ self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
+
+ self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+
+ if temb_channels is not None:
+ self.time_emb_proj = torch.nn.Linear(temb_channels, out_channels)
+ else:
+ self.time_emb_proj = None
+
+ self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
+ self.dropout = torch.nn.Dropout(dropout)
+ self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+
+ if non_linearity == "swish":
+ self.nonlinearity = lambda x: F.silu(x)
+ elif non_linearity == "mish":
+ self.nonlinearity = Mish()
+ elif non_linearity == "silu":
+ self.nonlinearity = nn.SiLU()
+
+ self.upsample = self.downsample = None
+ if self.up:
+ if kernel == "fir":
+ fir_kernel = (1, 3, 3, 1)
+ self.upsample = lambda x: upsample_2d(x, kernel=fir_kernel)
+ elif kernel == "sde_vp":
+ self.upsample = partial(F.interpolate, scale_factor=2.0, mode="nearest")
+ else:
+ self.upsample = Upsample2D(in_channels, use_conv=False)
+ elif self.down:
+ if kernel == "fir":
+ fir_kernel = (1, 3, 3, 1)
+ self.downsample = lambda x: downsample_2d(x, kernel=fir_kernel)
+ elif kernel == "sde_vp":
+ self.downsample = partial(F.avg_pool2d, kernel_size=2, stride=2)
+ else:
+ self.downsample = Downsample2D(in_channels, use_conv=False, padding=1, name="op")
+
+ self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut
+
+ self.conv_shortcut = None
+ if self.use_in_shortcut:
+ self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+
+ def forward(self, input_tensor, temb):
+ hidden_states = input_tensor
+
+ hidden_states = self.norm1(hidden_states)
+ hidden_states = self.nonlinearity(hidden_states)
+
+ if self.upsample is not None:
+ # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+ if hidden_states.shape[0] >= 64:
+ input_tensor = input_tensor.contiguous()
+ hidden_states = hidden_states.contiguous()
+ input_tensor = self.upsample(input_tensor)
+ hidden_states = self.upsample(hidden_states)
+ elif self.downsample is not None:
+ input_tensor = self.downsample(input_tensor)
+ hidden_states = self.downsample(hidden_states)
+
+ hidden_states = self.conv1(hidden_states)
+
+ if temb is not None:
+ temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None]
+ hidden_states = hidden_states + temb
+
+ hidden_states = self.norm2(hidden_states)
+ hidden_states = self.nonlinearity(hidden_states)
+
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.conv2(hidden_states)
+
+ if self.conv_shortcut is not None:
+ input_tensor = self.conv_shortcut(input_tensor)
+
+ output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
+
+ return output_tensor
+
+
+class Mish(torch.nn.Module):
+ def forward(self, hidden_states):
+ return hidden_states * torch.tanh(torch.nn.functional.softplus(hidden_states))
+
+
+def upsample_2d(hidden_states, kernel=None, factor=2, gain=1):
+ r"""Upsample2D a batch of 2D images with the given filter.
+ Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and upsamples each image with the given
+ filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the specified
+ `gain`. Pixels outside the image are assumed to be zero, and the filter is padded with zeros so that its shape is
+ a: multiple of the upsampling factor.
+
+ Args:
+ hidden_states: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
+ kernel: FIR filter of the shape `[firH, firW]` or `[firN]`
+ (separable). The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling.
+ factor: Integer upsampling factor (default: 2).
+ gain: Scaling factor for signal magnitude (default: 1.0).
+
+ Returns:
+ output: Tensor of the shape `[N, C, H * factor, W * factor]`
+ """
+ assert isinstance(factor, int) and factor >= 1
+ if kernel is None:
+ kernel = [1] * factor
+
+ kernel = torch.tensor(kernel, dtype=torch.float32)
+ if kernel.ndim == 1:
+ kernel = torch.outer(kernel, kernel)
+ kernel /= torch.sum(kernel)
+
+ kernel = kernel * (gain * (factor**2))
+ pad_value = kernel.shape[0] - factor
+ output = upfirdn2d_native(
+ hidden_states,
+ kernel.to(device=hidden_states.device),
+ up=factor,
+ pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2),
+ )
+ return output
+
+
+def downsample_2d(hidden_states, kernel=None, factor=2, gain=1):
+ r"""Downsample2D a batch of 2D images with the given filter.
+ Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and downsamples each image with the
+ given filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the
+ specified `gain`. Pixels outside the image are assumed to be zero, and the filter is padded with zeros so that its
+ shape is a multiple of the downsampling factor.
+
+ Args:
+ hidden_states: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
+ kernel: FIR filter of the shape `[firH, firW]` or `[firN]`
+ (separable). The default is `[1] * factor`, which corresponds to average pooling.
+ factor: Integer downsampling factor (default: 2).
+ gain: Scaling factor for signal magnitude (default: 1.0).
+
+ Returns:
+ output: Tensor of the shape `[N, C, H // factor, W // factor]`
+ """
+
+ assert isinstance(factor, int) and factor >= 1
+ if kernel is None:
+ kernel = [1] * factor
+
+ kernel = torch.tensor(kernel, dtype=torch.float32)
+ if kernel.ndim == 1:
+ kernel = torch.outer(kernel, kernel)
+ kernel /= torch.sum(kernel)
+
+ kernel = kernel * gain
+ pad_value = kernel.shape[0] - factor
+ output = upfirdn2d_native(
+ hidden_states, kernel.to(device=hidden_states.device), down=factor, pad=((pad_value + 1) // 2, pad_value // 2)
+ )
+ return output
+
+
+def upfirdn2d_native(tensor, kernel, up=1, down=1, pad=(0, 0)):
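+ # Zero-insert upsampling by `up`, zero padding by `pad`, 2D FIR filtering with `kernel`,
+ # then subsampling by `down` -- the classic upfirdn operation, expressed with reshape/pad/conv2d.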
+ up_x = up_y = up
+ down_x = down_y = down
+ pad_x0 = pad_y0 = pad[0]
+ pad_x1 = pad_y1 = pad[1]
+
+ _, channel, in_h, in_w = tensor.shape
+ tensor = tensor.reshape(-1, in_h, in_w, 1)
+
+ _, in_h, in_w, minor = tensor.shape
+ kernel_h, kernel_w = kernel.shape
+
+ out = tensor.view(-1, in_h, 1, in_w, 1, minor)
+ out = F.pad(out, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1])
+ out = out.view(-1, in_h * up_y, in_w * up_x, minor)
+
+ out = F.pad(out, [0, 0, max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0)])
+ out = out.to(tensor.device) # Move back to mps if necessary
+ out = out[
+ :,
+ max(-pad_y0, 0) : out.shape[1] - max(-pad_y1, 0),
+ max(-pad_x0, 0) : out.shape[2] - max(-pad_x1, 0),
+ :,
+ ]
+
+ out = out.permute(0, 3, 1, 2)
+ out = out.reshape([-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1])
+ w = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w)
+ out = F.conv2d(out, w)
+ out = out.reshape(
+ -1,
+ minor,
+ in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1,
+ in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1,
+ )
+ out = out.permute(0, 2, 3, 1)
+ out = out[:, ::down_y, ::down_x, :]
+
+ out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1
+ out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1
+
+ return out.view(-1, channel, out_h, out_w)
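
The FIR helpers at the bottom of this file implement StyleGAN2-style filtered resampling on top of `upfirdn2d_native`. A small shape check, written against the functions above (a sketch, assuming the vendored `diffusers` package is importable):

```python
import torch

from diffusers.models.resnet import upsample_2d, downsample_2d

x = torch.randn(1, 3, 16, 16)

# FIR-filtered 2x up- and downsampling with the (1, 3, 3, 1) kernel used elsewhere in this file
up = upsample_2d(x, kernel=(1, 3, 3, 1), factor=2)
down = downsample_2d(up, kernel=(1, 3, 3, 1), factor=2)

print(up.shape, down.shape)
# torch.Size([1, 3, 32, 32]) torch.Size([1, 3, 16, 16])
```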
diff --git a/src/model/TextGen/diffusers/models/resnet_flax.py b/src/model/TextGen/diffusers/models/resnet_flax.py
new file mode 100644
index 0000000..6327803
--- /dev/null
+++ b/src/model/TextGen/diffusers/models/resnet_flax.py
@@ -0,0 +1,124 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+
+
+class FlaxUpsample2D(nn.Module):
+ out_channels: int
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ self.conv = nn.Conv(
+ self.out_channels,
+ kernel_size=(3, 3),
+ strides=(1, 1),
+ padding=((1, 1), (1, 1)),
+ dtype=self.dtype,
+ )
+
+ def __call__(self, hidden_states):
+ batch, height, width, channels = hidden_states.shape
+ hidden_states = jax.image.resize(
+ hidden_states,
+ shape=(batch, height * 2, width * 2, channels),
+ method="nearest",
+ )
+ hidden_states = self.conv(hidden_states)
+ return hidden_states
+
+
+class FlaxDownsample2D(nn.Module):
+ out_channels: int
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ self.conv = nn.Conv(
+ self.out_channels,
+ kernel_size=(3, 3),
+ strides=(2, 2),
+ padding=((1, 1), (1, 1)), # padding="VALID",
+ dtype=self.dtype,
+ )
+
+ def __call__(self, hidden_states):
+ # pad = ((0, 0), (0, 1), (0, 1), (0, 0)) # pad height and width dim
+ # hidden_states = jnp.pad(hidden_states, pad_width=pad)
+ hidden_states = self.conv(hidden_states)
+ return hidden_states
+
+
+class FlaxResnetBlock2D(nn.Module):
+ in_channels: int
+ out_channels: int = None
+ dropout_prob: float = 0.0
+ use_nin_shortcut: bool = None
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ out_channels = self.in_channels if self.out_channels is None else self.out_channels
+
+ self.norm1 = nn.GroupNorm(num_groups=32, epsilon=1e-5)
+ self.conv1 = nn.Conv(
+ out_channels,
+ kernel_size=(3, 3),
+ strides=(1, 1),
+ padding=((1, 1), (1, 1)),
+ dtype=self.dtype,
+ )
+
+ self.time_emb_proj = nn.Dense(out_channels, dtype=self.dtype)
+
+ self.norm2 = nn.GroupNorm(num_groups=32, epsilon=1e-5)
+ self.dropout = nn.Dropout(self.dropout_prob)
+ self.conv2 = nn.Conv(
+ out_channels,
+ kernel_size=(3, 3),
+ strides=(1, 1),
+ padding=((1, 1), (1, 1)),
+ dtype=self.dtype,
+ )
+
+ use_nin_shortcut = self.in_channels != out_channels if self.use_nin_shortcut is None else self.use_nin_shortcut
+
+ self.conv_shortcut = None
+ if use_nin_shortcut:
+ self.conv_shortcut = nn.Conv(
+ out_channels,
+ kernel_size=(1, 1),
+ strides=(1, 1),
+ padding="VALID",
+ dtype=self.dtype,
+ )
+
+ def __call__(self, hidden_states, temb, deterministic=True):
+ residual = hidden_states
+ hidden_states = self.norm1(hidden_states)
+ hidden_states = nn.swish(hidden_states)
+ hidden_states = self.conv1(hidden_states)
+
+ temb = self.time_emb_proj(nn.swish(temb))
+ temb = jnp.expand_dims(jnp.expand_dims(temb, 1), 1)
+ hidden_states = hidden_states + temb
+
+ hidden_states = self.norm2(hidden_states)
+ hidden_states = nn.swish(hidden_states)
+ hidden_states = self.dropout(hidden_states, deterministic)
+ hidden_states = self.conv2(hidden_states)
+
+ if self.conv_shortcut is not None:
+ residual = self.conv_shortcut(residual)
+
+ return hidden_states + residual
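
For reference, a minimal init/apply round trip for `FlaxResnetBlock2D` (a sketch, not part of the patch; note the NHWC layout, and that the time-embedding width is free because `time_emb_proj` is a `Dense` layer):

```python
import jax
import jax.numpy as jnp

from diffusers.models.resnet_flax import FlaxResnetBlock2D

hidden_states = jnp.zeros((1, 16, 16, 32))  # NHWC
temb = jnp.zeros((1, 128))                  # per-sample time embedding

block = FlaxResnetBlock2D(in_channels=32, out_channels=64)
params = block.init(jax.random.PRNGKey(0), hidden_states, temb)
out = block.apply(params, hidden_states, temb)
print(out.shape)  # (1, 16, 16, 64); the 1x1 nin shortcut handles the channel change
```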
diff --git a/src/model/TextGen/diffusers/models/unet_1d.py b/src/model/TextGen/diffusers/models/unet_1d.py
new file mode 100644
index 0000000..cc0685d
--- /dev/null
+++ b/src/model/TextGen/diffusers/models/unet_1d.py
@@ -0,0 +1,172 @@
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..modeling_utils import ModelMixin
+from ..utils import BaseOutput
+from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps
+from .unet_1d_blocks import get_down_block, get_mid_block, get_up_block
+
+
+@dataclass
+class UNet1DOutput(BaseOutput):
+ """
+ Args:
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, sample_size)`):
+ Hidden states output. Output of last layer of model.
+ """
+
+ sample: torch.FloatTensor
+
+
+class UNet1DModel(ModelMixin, ConfigMixin):
+ r"""
+ UNet1DModel is a 1D UNet model that takes in a noisy sample and a timestep and returns a sample-shaped output.
+
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
+ implements for all models (such as downloading or saving).
+
+ Parameters:
+ sample_size (`int`, *optional*): Default length of sample. Should be adaptable at runtime.
+ in_channels (`int`, *optional*, defaults to 2): Number of channels in the input sample.
+ out_channels (`int`, *optional*, defaults to 2): Number of channels in the output.
+ time_embedding_type (`str`, *optional*, defaults to `"fourier"`): Type of time embedding to use.
+ freq_shift (`int`, *optional*, defaults to 0): Frequency shift for fourier time embedding.
+ flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
+ Whether to flip sin to cos for fourier time embedding.
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D")`):
+ Tuple of downsample block types.
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip")`):
+ Tuple of upsample block types.
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(32, 32, 64)`):
+ Tuple of block output channels.
+ """
+
+ @register_to_config
+ def __init__(
+ self,
+ sample_size: int = 65536,
+ sample_rate: Optional[int] = None,
+ in_channels: int = 2,
+ out_channels: int = 2,
+ extra_in_channels: int = 0,
+ time_embedding_type: str = "fourier",
+ freq_shift: int = 0,
+ flip_sin_to_cos: bool = True,
+ use_timestep_embedding: bool = False,
+ down_block_types: Tuple[str] = ("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
+ mid_block_type: str = "UNetMidBlock1D",
+ up_block_types: Tuple[str] = ("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
+ block_out_channels: Tuple[int] = (32, 32, 64),
+ ):
+ super().__init__()
+
+ self.sample_size = sample_size
+
+ # time
+ if time_embedding_type == "fourier":
+ self.time_proj = GaussianFourierProjection(
+ embedding_size=8, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
+ )
+ timestep_input_dim = 2 * block_out_channels[0]
+ elif time_embedding_type == "positional":
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+ timestep_input_dim = block_out_channels[0]
+
+ if use_timestep_embedding:
+ time_embed_dim = block_out_channels[0] * 4
+ self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+
+ self.down_blocks = nn.ModuleList([])
+ self.mid_block = None
+ self.up_blocks = nn.ModuleList([])
+ self.out_block = None
+
+ # down
+ output_channel = in_channels
+ for i, down_block_type in enumerate(down_block_types):
+ input_channel = output_channel
+ output_channel = block_out_channels[i]
+
+ if i == 0:
+ input_channel += extra_in_channels
+
+ down_block = get_down_block(
+ down_block_type,
+ in_channels=input_channel,
+ out_channels=output_channel,
+ )
+ self.down_blocks.append(down_block)
+
+ # mid
+ self.mid_block = get_mid_block(
+ mid_block_type=mid_block_type,
+ mid_channels=block_out_channels[-1],
+ in_channels=block_out_channels[-1],
+ out_channels=None,
+ )
+
+ # up
+ reversed_block_out_channels = list(reversed(block_out_channels))
+ output_channel = reversed_block_out_channels[0]
+ for i, up_block_type in enumerate(up_block_types):
+ prev_output_channel = output_channel
+ output_channel = reversed_block_out_channels[i + 1] if i < len(up_block_types) - 1 else out_channels
+
+ up_block = get_up_block(
+ up_block_type,
+ in_channels=prev_output_channel,
+ out_channels=output_channel,
+ )
+ self.up_blocks.append(up_block)
+ prev_output_channel = output_channel
+
+ # TODO(PVP, Nathan) placeholder for RL application to be merged shortly
+ # Totally fine to add another layer with an if statement - no need for nn.Identity here
+
+ def forward(
+ self,
+ sample: torch.FloatTensor,
+ timestep: Union[torch.Tensor, float, int],
+ return_dict: bool = True,
+ ) -> Union[UNet1DOutput, Tuple]:
+ r"""
+ Args:
+ sample (`torch.FloatTensor`): `(batch_size, num_channels, sample_size)` noisy inputs tensor
+ timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~models.unet_1d.UNet1DOutput`] instead of a plain tuple.
+
+ Returns:
+ [`~models.unet_1d.UNet1DOutput`] or `tuple`: [`~models.unet_1d.UNet1DOutput`] if `return_dict` is True,
+ otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+ """
+ # 1. time
+ # accept plain Python numbers as well as 0-d tensors, matching the `Union[torch.Tensor, float, int]` hint
+ if not torch.is_tensor(timestep):
+ timestep = torch.tensor([timestep], dtype=torch.long, device=sample.device)
+ elif len(timestep.shape) == 0:
+ timestep = timestep[None].to(sample.device)
+
+ timestep_embed = self.time_proj(timestep)[..., None]
+ timestep_embed = timestep_embed.repeat([1, 1, sample.shape[2]]).to(sample.dtype)
+
+ # 2. down
+ down_block_res_samples = ()
+ for downsample_block in self.down_blocks:
+ sample, res_samples = downsample_block(hidden_states=sample, temb=timestep_embed)
+ down_block_res_samples += res_samples
+
+ # 3. mid
+ sample = self.mid_block(sample)
+
+ # 4. up
+ for i, upsample_block in enumerate(self.up_blocks):
+ res_samples = down_block_res_samples[-1:]
+ down_block_res_samples = down_block_res_samples[:-1]
+ sample = upsample_block(sample, res_samples)
+
+ if not return_dict:
+ return (sample,)
+
+ return UNet1DOutput(sample=sample)
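
A smoke test for the 1D UNet (a sketch, not part of the patch). Because `DownBlock1DNoSkip` concatenates the timestep features onto the input channels, `extra_in_channels` must match the width of the Fourier projection (2 x `embedding_size` = 16 with the defaults above), and the sequence length should be divisible by 8 so the three stride-2 stages round-trip cleanly:

```python
import torch

from diffusers.models.unet_1d import UNet1DModel

model = UNet1DModel(
    sample_size=256,
    in_channels=2,
    out_channels=2,
    extra_in_channels=16,  # width of the Fourier timestep features
)

sample = torch.randn(1, 2, 256)  # (batch, channels, length)
out = model(sample, torch.tensor(10)).sample
print(out.shape)  # torch.Size([1, 2, 256])
```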
diff --git a/src/model/TextGen/diffusers/models/unet_1d_blocks.py b/src/model/TextGen/diffusers/models/unet_1d_blocks.py
new file mode 100644
index 0000000..9009071
--- /dev/null
+++ b/src/model/TextGen/diffusers/models/unet_1d_blocks.py
@@ -0,0 +1,384 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+_kernels = {
+ "linear": [1 / 8, 3 / 8, 3 / 8, 1 / 8],
+ "cubic": [-0.01171875, -0.03515625, 0.11328125, 0.43359375, 0.43359375, 0.11328125, -0.03515625, -0.01171875],
+ "lanczos3": [
+ 0.003689131001010537,
+ 0.015056144446134567,
+ -0.03399861603975296,
+ -0.066637322306633,
+ 0.13550527393817902,
+ 0.44638532400131226,
+ 0.44638532400131226,
+ 0.13550527393817902,
+ -0.066637322306633,
+ -0.03399861603975296,
+ 0.015056144446134567,
+ 0.003689131001010537,
+ ],
+}
+
+
+class Downsample1d(nn.Module):
+ def __init__(self, kernel="linear", pad_mode="reflect"):
+ super().__init__()
+ self.pad_mode = pad_mode
+ kernel_1d = torch.tensor(_kernels[kernel])
+ self.pad = kernel_1d.shape[0] // 2 - 1
+ self.register_buffer("kernel", kernel_1d)
+
+ def forward(self, hidden_states):
+ hidden_states = F.pad(hidden_states, (self.pad,) * 2, self.pad_mode)
+ weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]])
+ indices = torch.arange(hidden_states.shape[1], device=hidden_states.device)
+ weight[indices, indices] = self.kernel.to(weight)
+ return F.conv1d(hidden_states, weight, stride=2)
+
+
+class Upsample1d(nn.Module):
+ def __init__(self, kernel="linear", pad_mode="reflect"):
+ super().__init__()
+ self.pad_mode = pad_mode
+ kernel_1d = torch.tensor(_kernels[kernel]) * 2
+ self.pad = kernel_1d.shape[0] // 2 - 1
+ self.register_buffer("kernel", kernel_1d)
+
+ def forward(self, hidden_states):
+ hidden_states = F.pad(hidden_states, ((self.pad + 1) // 2,) * 2, self.pad_mode)
+ weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]])
+ indices = torch.arange(hidden_states.shape[1], device=hidden_states.device)
+ weight[indices, indices] = self.kernel.to(weight)
+ return F.conv_transpose1d(hidden_states, weight, stride=2, padding=self.pad * 2 + 1)
+
+
+class SelfAttention1d(nn.Module):
+ def __init__(self, in_channels, n_head=1, dropout_rate=0.0):
+ super().__init__()
+ self.channels = in_channels
+ self.group_norm = nn.GroupNorm(1, num_channels=in_channels)
+ self.num_heads = n_head
+
+ self.query = nn.Linear(self.channels, self.channels)
+ self.key = nn.Linear(self.channels, self.channels)
+ self.value = nn.Linear(self.channels, self.channels)
+
+ self.proj_attn = nn.Linear(self.channels, self.channels, bias=True)
+
+ self.dropout = nn.Dropout(dropout_rate, inplace=True)
+
+ def transpose_for_scores(self, projection: torch.Tensor) -> torch.Tensor:
+ new_projection_shape = projection.size()[:-1] + (self.num_heads, -1)
+ # move heads to 2nd position (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
+ new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
+ return new_projection
+
+ def forward(self, hidden_states):
+ residual = hidden_states
+ batch, channel_dim, seq = hidden_states.shape
+
+ hidden_states = self.group_norm(hidden_states)
+ hidden_states = hidden_states.transpose(1, 2)
+
+ query_proj = self.query(hidden_states)
+ key_proj = self.key(hidden_states)
+ value_proj = self.value(hidden_states)
+
+ query_states = self.transpose_for_scores(query_proj)
+ key_states = self.transpose_for_scores(key_proj)
+ value_states = self.transpose_for_scores(value_proj)
+
+ scale = 1 / math.sqrt(math.sqrt(key_states.shape[-1]))
+
+ attention_scores = torch.matmul(query_states * scale, key_states.transpose(-1, -2) * scale)
+ attention_probs = torch.softmax(attention_scores, dim=-1)
+
+ # compute attention output
+ hidden_states = torch.matmul(attention_probs, value_states)
+
+ hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous()
+ new_hidden_states_shape = hidden_states.size()[:-2] + (self.channels,)
+ hidden_states = hidden_states.view(new_hidden_states_shape)
+
+ # compute next hidden_states
+ hidden_states = self.proj_attn(hidden_states)
+ hidden_states = hidden_states.transpose(1, 2)
+ hidden_states = self.dropout(hidden_states)
+
+ output = hidden_states + residual
+
+ return output
+
+
+class ResConvBlock(nn.Module):
+ def __init__(self, in_channels, mid_channels, out_channels, is_last=False):
+ super().__init__()
+ self.is_last = is_last
+ self.has_conv_skip = in_channels != out_channels
+
+ if self.has_conv_skip:
+ self.conv_skip = nn.Conv1d(in_channels, out_channels, 1, bias=False)
+
+ self.conv_1 = nn.Conv1d(in_channels, mid_channels, 5, padding=2)
+ self.group_norm_1 = nn.GroupNorm(1, mid_channels)
+ self.gelu_1 = nn.GELU()
+ self.conv_2 = nn.Conv1d(mid_channels, out_channels, 5, padding=2)
+
+ if not self.is_last:
+ self.group_norm_2 = nn.GroupNorm(1, out_channels)
+ self.gelu_2 = nn.GELU()
+
+ def forward(self, hidden_states):
+ residual = self.conv_skip(hidden_states) if self.has_conv_skip else hidden_states
+
+ hidden_states = self.conv_1(hidden_states)
+ hidden_states = self.group_norm_1(hidden_states)
+ hidden_states = self.gelu_1(hidden_states)
+ hidden_states = self.conv_2(hidden_states)
+
+ if not self.is_last:
+ hidden_states = self.group_norm_2(hidden_states)
+ hidden_states = self.gelu_2(hidden_states)
+
+ output = hidden_states + residual
+ return output
+
+
+def get_down_block(down_block_type, out_channels, in_channels):
+ if down_block_type == "DownBlock1D":
+ return DownBlock1D(out_channels=out_channels, in_channels=in_channels)
+ elif down_block_type == "AttnDownBlock1D":
+ return AttnDownBlock1D(out_channels=out_channels, in_channels=in_channels)
+ elif down_block_type == "DownBlock1DNoSkip":
+ return DownBlock1DNoSkip(out_channels=out_channels, in_channels=in_channels)
+ raise ValueError(f"{down_block_type} does not exist.")
+
+
+def get_up_block(up_block_type, in_channels, out_channels):
+ if up_block_type == "UpBlock1D":
+ return UpBlock1D(in_channels=in_channels, out_channels=out_channels)
+ elif up_block_type == "AttnUpBlock1D":
+ return AttnUpBlock1D(in_channels=in_channels, out_channels=out_channels)
+ elif up_block_type == "UpBlock1DNoSkip":
+ return UpBlock1DNoSkip(in_channels=in_channels, out_channels=out_channels)
+ raise ValueError(f"{up_block_type} does not exist.")
+
+
+def get_mid_block(mid_block_type, in_channels, mid_channels, out_channels):
+ if mid_block_type == "UNetMidBlock1D":
+ return UNetMidBlock1D(in_channels=in_channels, mid_channels=mid_channels, out_channels=out_channels)
+ raise ValueError(f"{mid_block_type} does not exist.")
+
+
+class UNetMidBlock1D(nn.Module):
+ def __init__(self, mid_channels, in_channels, out_channels=None):
+ super().__init__()
+
+ out_channels = in_channels if out_channels is None else out_channels
+
+ # there is always at least one resnet
+ self.down = Downsample1d("cubic")
+ resnets = [
+ ResConvBlock(in_channels, mid_channels, mid_channels),
+ ResConvBlock(mid_channels, mid_channels, mid_channels),
+ ResConvBlock(mid_channels, mid_channels, mid_channels),
+ ResConvBlock(mid_channels, mid_channels, mid_channels),
+ ResConvBlock(mid_channels, mid_channels, mid_channels),
+ ResConvBlock(mid_channels, mid_channels, out_channels),
+ ]
+ attentions = [
+ SelfAttention1d(mid_channels, mid_channels // 32),
+ SelfAttention1d(mid_channels, mid_channels // 32),
+ SelfAttention1d(mid_channels, mid_channels // 32),
+ SelfAttention1d(mid_channels, mid_channels // 32),
+ SelfAttention1d(mid_channels, mid_channels // 32),
+ SelfAttention1d(out_channels, out_channels // 32),
+ ]
+ self.up = Upsample1d(kernel="cubic")
+
+ self.attentions = nn.ModuleList(attentions)
+ self.resnets = nn.ModuleList(resnets)
+
+ def forward(self, hidden_states):
+ hidden_states = self.down(hidden_states)
+ for attn, resnet in zip(self.attentions, self.resnets):
+ hidden_states = resnet(hidden_states)
+ hidden_states = attn(hidden_states)
+
+ hidden_states = self.up(hidden_states)
+
+ return hidden_states
+
+
+class AttnDownBlock1D(nn.Module):
+ def __init__(self, out_channels, in_channels, mid_channels=None):
+ super().__init__()
+ mid_channels = out_channels if mid_channels is None else mid_channels
+
+ self.down = Downsample1d("cubic")
+ resnets = [
+ ResConvBlock(in_channels, mid_channels, mid_channels),
+ ResConvBlock(mid_channels, mid_channels, mid_channels),
+ ResConvBlock(mid_channels, mid_channels, out_channels),
+ ]
+ attentions = [
+ SelfAttention1d(mid_channels, mid_channels // 32),
+ SelfAttention1d(mid_channels, mid_channels // 32),
+ SelfAttention1d(out_channels, out_channels // 32),
+ ]
+
+ self.attentions = nn.ModuleList(attentions)
+ self.resnets = nn.ModuleList(resnets)
+
+ def forward(self, hidden_states, temb=None):
+ hidden_states = self.down(hidden_states)
+
+ for resnet, attn in zip(self.resnets, self.attentions):
+ hidden_states = resnet(hidden_states)
+ hidden_states = attn(hidden_states)
+
+ return hidden_states, (hidden_states,)
+
+
+class DownBlock1D(nn.Module):
+ def __init__(self, out_channels, in_channels, mid_channels=None):
+ super().__init__()
+ mid_channels = out_channels if mid_channels is None else mid_channels
+
+ self.down = Downsample1d("cubic")
+ resnets = [
+ ResConvBlock(in_channels, mid_channels, mid_channels),
+ ResConvBlock(mid_channels, mid_channels, mid_channels),
+ ResConvBlock(mid_channels, mid_channels, out_channels),
+ ]
+
+ self.resnets = nn.ModuleList(resnets)
+
+ def forward(self, hidden_states, temb=None):
+ hidden_states = self.down(hidden_states)
+
+ for resnet in self.resnets:
+ hidden_states = resnet(hidden_states)
+
+ return hidden_states, (hidden_states,)
+
+
+class DownBlock1DNoSkip(nn.Module):
+ def __init__(self, out_channels, in_channels, mid_channels=None):
+ super().__init__()
+ mid_channels = out_channels if mid_channels is None else mid_channels
+
+ resnets = [
+ ResConvBlock(in_channels, mid_channels, mid_channels),
+ ResConvBlock(mid_channels, mid_channels, mid_channels),
+ ResConvBlock(mid_channels, mid_channels, out_channels),
+ ]
+
+ self.resnets = nn.ModuleList(resnets)
+
+ def forward(self, hidden_states, temb=None):
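+ # Unlike the other 1D down blocks, the timestep features are concatenated onto the input
+ # channels here; UNet1DModel accounts for this via `extra_in_channels` on the first block.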
+ hidden_states = torch.cat([hidden_states, temb], dim=1)
+ for resnet in self.resnets:
+ hidden_states = resnet(hidden_states)
+
+ return hidden_states, (hidden_states,)
+
+
+class AttnUpBlock1D(nn.Module):
+ def __init__(self, in_channels, out_channels, mid_channels=None):
+ super().__init__()
+ mid_channels = out_channels if mid_channels is None else mid_channels
+
+ resnets = [
+ ResConvBlock(2 * in_channels, mid_channels, mid_channels),
+ ResConvBlock(mid_channels, mid_channels, mid_channels),
+ ResConvBlock(mid_channels, mid_channels, out_channels),
+ ]
+ attentions = [
+ SelfAttention1d(mid_channels, mid_channels // 32),
+ SelfAttention1d(mid_channels, mid_channels // 32),
+ SelfAttention1d(out_channels, out_channels // 32),
+ ]
+
+ self.attentions = nn.ModuleList(attentions)
+ self.resnets = nn.ModuleList(resnets)
+ self.up = Upsample1d(kernel="cubic")
+
+ def forward(self, hidden_states, res_hidden_states_tuple):
+ res_hidden_states = res_hidden_states_tuple[-1]
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+ for resnet, attn in zip(self.resnets, self.attentions):
+ hidden_states = resnet(hidden_states)
+ hidden_states = attn(hidden_states)
+
+ hidden_states = self.up(hidden_states)
+
+ return hidden_states
+
+
+class UpBlock1D(nn.Module):
+ def __init__(self, in_channels, out_channels, mid_channels=None):
+ super().__init__()
+ mid_channels = in_channels if mid_channels is None else mid_channels
+
+ resnets = [
+ ResConvBlock(2 * in_channels, mid_channels, mid_channels),
+ ResConvBlock(mid_channels, mid_channels, mid_channels),
+ ResConvBlock(mid_channels, mid_channels, out_channels),
+ ]
+
+ self.resnets = nn.ModuleList(resnets)
+ self.up = Upsample1d(kernel="cubic")
+
+ def forward(self, hidden_states, res_hidden_states_tuple):
+ res_hidden_states = res_hidden_states_tuple[-1]
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+ for resnet in self.resnets:
+ hidden_states = resnet(hidden_states)
+
+ hidden_states = self.up(hidden_states)
+
+ return hidden_states
+
+
+class UpBlock1DNoSkip(nn.Module):
+ def __init__(self, in_channels, out_channels, mid_channels=None):
+ super().__init__()
+ mid_channels = in_channels if mid_channels is None else mid_channels
+
+ resnets = [
+ ResConvBlock(2 * in_channels, mid_channels, mid_channels),
+ ResConvBlock(mid_channels, mid_channels, mid_channels),
+ ResConvBlock(mid_channels, mid_channels, out_channels, is_last=True),
+ ]
+
+ self.resnets = nn.ModuleList(resnets)
+
+ def forward(self, hidden_states, res_hidden_states_tuple):
+ res_hidden_states = res_hidden_states_tuple[-1]
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+ for resnet in self.resnets:
+ hidden_states = resnet(hidden_states)
+
+ return hidden_states
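
The `Downsample1d`/`Upsample1d` pair above resamples with a fixed depthwise FIR filter rather than a learned convolution. A quick shape check (a sketch, assuming the vendored package is importable):

```python
import torch

from diffusers.models.unet_1d_blocks import Downsample1d, Upsample1d

x = torch.randn(1, 8, 64)  # (batch, channels, length)

down = Downsample1d(kernel="cubic")  # FIR filter + stride-2 conv
up = Upsample1d(kernel="cubic")      # matching stride-2 transposed conv

y = down(x)
z = up(y)
print(y.shape, z.shape)  # torch.Size([1, 8, 32]) torch.Size([1, 8, 64])
```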
diff --git a/src/model/TextGen/diffusers/models/unet_2d.py b/src/model/TextGen/diffusers/models/unet_2d.py
new file mode 100644
index 0000000..641c253
--- /dev/null
+++ b/src/model/TextGen/diffusers/models/unet_2d.py
@@ -0,0 +1,261 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..modeling_utils import ModelMixin
+from ..utils import BaseOutput
+from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps
+from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block
+
+
+@dataclass
+class UNet2DOutput(BaseOutput):
+ """
+ Args:
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Hidden states output. Output of last layer of model.
+ """
+
+ sample: torch.FloatTensor
+
+
+class UNet2DModel(ModelMixin, ConfigMixin):
+ r"""
+ UNet2DModel is a 2D UNet model that takes in a noisy sample and a timestep and returns a sample-shaped output.
+
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
+ implements for all models (such as downloading or saving).
+
+ Parameters:
+ sample_size (`int`, *optional*): Input sample size.
+ in_channels (`int`, *optional*, defaults to 3): Number of channels in the input image.
+ out_channels (`int`, *optional*, defaults to 3): Number of channels in the output.
+ center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
+ time_embedding_type (`str`, *optional*, defaults to `"positional"`): Type of time embedding to use.
+ freq_shift (`int`, *optional*, defaults to 0): Frequency shift for fourier time embedding.
+ flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
+ Whether to flip sin to cos for fourier time embedding.
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D")`):
+ Tuple of downsample block types.
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D")`):
+ Tuple of upsample block types.
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(224, 448, 672, 896)`):
+ Tuple of block output channels.
+ layers_per_block (`int`, *optional*, defaults to `2`): The number of layers per block.
+ mid_block_scale_factor (`float`, *optional*, defaults to `1`): The scale factor for the mid block.
+ downsample_padding (`int`, *optional*, defaults to `1`): The padding for the downsample convolution.
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+ attention_head_dim (`int`, *optional*, defaults to `8`): The attention head dimension.
+ norm_num_groups (`int`, *optional*, defaults to `32`): The number of groups for the normalization.
+ norm_eps (`float`, *optional*, defaults to `1e-5`): The epsilon for the normalization.
+ """
+
+ @register_to_config
+ def __init__(
+ self,
+ sample_size: Optional[int] = None,
+ in_channels: int = 3,
+ out_channels: int = 3,
+ center_input_sample: bool = False,
+ time_embedding_type: str = "positional",
+ freq_shift: int = 0,
+ flip_sin_to_cos: bool = True,
+ down_block_types: Tuple[str] = ("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"),
+ up_block_types: Tuple[str] = ("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"),
+ block_out_channels: Tuple[int] = (224, 448, 672, 896),
+ layers_per_block: int = 2,
+ mid_block_scale_factor: float = 1,
+ downsample_padding: int = 1,
+ act_fn: str = "silu",
+ attention_head_dim: int = 8,
+ norm_num_groups: int = 32,
+ norm_eps: float = 1e-5,
+ ):
+ super().__init__()
+
+ self.sample_size = sample_size
+ time_embed_dim = block_out_channels[0] * 4
+
+ # input
+ self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))
+
+ # time
+ if time_embedding_type == "fourier":
+ self.time_proj = GaussianFourierProjection(embedding_size=block_out_channels[0], scale=16)
+ timestep_input_dim = 2 * block_out_channels[0]
+ elif time_embedding_type == "positional":
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+ timestep_input_dim = block_out_channels[0]
+
+ self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+
+ self.down_blocks = nn.ModuleList([])
+ self.mid_block = None
+ self.up_blocks = nn.ModuleList([])
+
+ # down
+ output_channel = block_out_channels[0]
+ for i, down_block_type in enumerate(down_block_types):
+ input_channel = output_channel
+ output_channel = block_out_channels[i]
+ is_final_block = i == len(block_out_channels) - 1
+
+ down_block = get_down_block(
+ down_block_type,
+ num_layers=layers_per_block,
+ in_channels=input_channel,
+ out_channels=output_channel,
+ temb_channels=time_embed_dim,
+ add_downsample=not is_final_block,
+ resnet_eps=norm_eps,
+ resnet_act_fn=act_fn,
+ resnet_groups=norm_num_groups,
+ attn_num_head_channels=attention_head_dim,
+ downsample_padding=downsample_padding,
+ )
+ self.down_blocks.append(down_block)
+
+ # mid
+ self.mid_block = UNetMidBlock2D(
+ in_channels=block_out_channels[-1],
+ temb_channels=time_embed_dim,
+ resnet_eps=norm_eps,
+ resnet_act_fn=act_fn,
+ output_scale_factor=mid_block_scale_factor,
+ resnet_time_scale_shift="default",
+ attn_num_head_channels=attention_head_dim,
+ resnet_groups=norm_num_groups,
+ )
+
+ # up
+ reversed_block_out_channels = list(reversed(block_out_channels))
+ output_channel = reversed_block_out_channels[0]
+ for i, up_block_type in enumerate(up_block_types):
+ prev_output_channel = output_channel
+ output_channel = reversed_block_out_channels[i]
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+ is_final_block = i == len(block_out_channels) - 1
+
+ up_block = get_up_block(
+ up_block_type,
+ num_layers=layers_per_block + 1,
+ in_channels=input_channel,
+ out_channels=output_channel,
+ prev_output_channel=prev_output_channel,
+ temb_channels=time_embed_dim,
+ add_upsample=not is_final_block,
+ resnet_eps=norm_eps,
+ resnet_act_fn=act_fn,
+ resnet_groups=norm_num_groups,
+ attn_num_head_channels=attention_head_dim,
+ )
+ self.up_blocks.append(up_block)
+ prev_output_channel = output_channel
+
+ # out
+ num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32)
+ self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=num_groups_out, eps=norm_eps)
+ self.conv_act = nn.SiLU()
+ self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1)
+
+ def forward(
+ self,
+ sample: torch.FloatTensor,
+ timestep: Union[torch.Tensor, float, int],
+ return_dict: bool = True,
+ ) -> Union[UNet2DOutput, Tuple]:
+ r"""
+ Args:
+ sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
+ timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~models.unet_2d.UNet2DOutput`] instead of a plain tuple.
+
+ Returns:
+ [`~models.unet_2d.UNet2DOutput`] or `tuple`: [`~models.unet_2d.UNet2DOutput`] if `return_dict` is True,
+ otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+ """
+ # 0. center input if necessary
+ if self.config.center_input_sample:
+ sample = 2 * sample - 1.0
+
+ # 1. time
+ timesteps = timestep
+ if not torch.is_tensor(timesteps):
+ timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
+ elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
+ timesteps = timesteps[None].to(sample.device)
+
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+ timesteps = timesteps * torch.ones(sample.shape[0], dtype=timesteps.dtype, device=timesteps.device)
+
+ t_emb = self.time_proj(timesteps)
+ emb = self.time_embedding(t_emb)
+
+ # 2. pre-process
+ skip_sample = sample
+ sample = self.conv_in(sample)
+
+ # 3. down
+ down_block_res_samples = (sample,)
+ for downsample_block in self.down_blocks:
+ if hasattr(downsample_block, "skip_conv"):
+ sample, res_samples, skip_sample = downsample_block(
+ hidden_states=sample, temb=emb, skip_sample=skip_sample
+ )
+ else:
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+
+ down_block_res_samples += res_samples
+
+ # 4. mid
+ sample = self.mid_block(sample, emb)
+
+ # 5. up
+ skip_sample = None
+ for upsample_block in self.up_blocks:
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+ if hasattr(upsample_block, "skip_conv"):
+ sample, skip_sample = upsample_block(sample, res_samples, emb, skip_sample)
+ else:
+ sample = upsample_block(sample, res_samples, emb)
+
+ # 6. post-process
+ # make sure hidden states are in float32
+ # when running in half-precision
+ sample = self.conv_norm_out(sample.float()).type(sample.dtype)
+ sample = self.conv_act(sample)
+ sample = self.conv_out(sample)
+
+ if skip_sample is not None:
+ sample += skip_sample
+
+ if self.config.time_embedding_type == "fourier":
+ timesteps = timesteps.reshape((sample.shape[0], *([1] * len(sample.shape[1:]))))
+ sample = sample / timesteps
+
+ if not return_dict:
+ return (sample,)
+
+ return UNet2DOutput(sample=sample)
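
An end-to-end smoke test for the unconditional 2D UNet (a sketch, not part of the patch; the small two-level configuration is illustrative, chosen so the example runs quickly):

```python
import torch

from diffusers.models.unet_2d import UNet2DModel

model = UNet2DModel(
    sample_size=32,
    in_channels=3,
    out_channels=3,
    block_out_channels=(32, 64),
    down_block_types=("DownBlock2D", "AttnDownBlock2D"),
    up_block_types=("AttnUpBlock2D", "UpBlock2D"),
)

sample = torch.randn(1, 3, 32, 32)
out = model(sample, timestep=10).sample  # plain ints are broadcast across the batch
print(out.shape)  # torch.Size([1, 3, 32, 32])
```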
diff --git a/src/model/TextGen/diffusers/models/unet_2d_blocks.py b/src/model/TextGen/diffusers/models/unet_2d_blocks.py
new file mode 100644
index 0000000..770043f
--- /dev/null
+++ b/src/model/TextGen/diffusers/models/unet_2d_blocks.py
@@ -0,0 +1,1597 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import torch
+from torch import nn
+
+from .attention import AttentionBlock, Transformer2DModel
+from .resnet import Downsample2D, FirDownsample2D, FirUpsample2D, ResnetBlock2D, Upsample2D
+
+
+def get_down_block(
+ down_block_type,
+ num_layers,
+ in_channels,
+ out_channels,
+ temb_channels,
+ add_downsample,
+ resnet_eps,
+ resnet_act_fn,
+ attn_num_head_channels,
+ resnet_groups=None,
+ cross_attention_dim=None,
+ downsample_padding=None,
+):
+ down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
+ if down_block_type == "DownBlock2D":
+ return DownBlock2D(
+ num_layers=num_layers,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ add_downsample=add_downsample,
+ resnet_eps=resnet_eps,
+ resnet_act_fn=resnet_act_fn,
+ resnet_groups=resnet_groups,
+ downsample_padding=downsample_padding,
+ )
+ elif down_block_type == "AttnDownBlock2D":
+ return AttnDownBlock2D(
+ num_layers=num_layers,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ add_downsample=add_downsample,
+ resnet_eps=resnet_eps,
+ resnet_act_fn=resnet_act_fn,
+ resnet_groups=resnet_groups,
+ downsample_padding=downsample_padding,
+ attn_num_head_channels=attn_num_head_channels,
+ )
+ elif down_block_type == "CrossAttnDownBlock2D":
+ if cross_attention_dim is None:
+ raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D")
+ return CrossAttnDownBlock2D(
+ num_layers=num_layers,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ add_downsample=add_downsample,
+ resnet_eps=resnet_eps,
+ resnet_act_fn=resnet_act_fn,
+ resnet_groups=resnet_groups,
+ downsample_padding=downsample_padding,
+ cross_attention_dim=cross_attention_dim,
+ attn_num_head_channels=attn_num_head_channels,
+ )
+ elif down_block_type == "SkipDownBlock2D":
+ return SkipDownBlock2D(
+ num_layers=num_layers,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ add_downsample=add_downsample,
+ resnet_eps=resnet_eps,
+ resnet_act_fn=resnet_act_fn,
+ downsample_padding=downsample_padding,
+ )
+ elif down_block_type == "AttnSkipDownBlock2D":
+ return AttnSkipDownBlock2D(
+ num_layers=num_layers,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ add_downsample=add_downsample,
+ resnet_eps=resnet_eps,
+ resnet_act_fn=resnet_act_fn,
+ downsample_padding=downsample_padding,
+ attn_num_head_channels=attn_num_head_channels,
+ )
+ elif down_block_type == "DownEncoderBlock2D":
+ return DownEncoderBlock2D(
+ num_layers=num_layers,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ add_downsample=add_downsample,
+ resnet_eps=resnet_eps,
+ resnet_act_fn=resnet_act_fn,
+ resnet_groups=resnet_groups,
+ downsample_padding=downsample_padding,
+ )
+ elif down_block_type == "AttnDownEncoderBlock2D":
+ return AttnDownEncoderBlock2D(
+ num_layers=num_layers,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ add_downsample=add_downsample,
+ resnet_eps=resnet_eps,
+ resnet_act_fn=resnet_act_fn,
+ resnet_groups=resnet_groups,
+ downsample_padding=downsample_padding,
+ attn_num_head_channels=attn_num_head_channels,
+ )
+ raise ValueError(f"{down_block_type} does not exist.")
+
+
+def get_up_block(
+ up_block_type,
+ num_layers,
+ in_channels,
+ out_channels,
+ prev_output_channel,
+ temb_channels,
+ add_upsample,
+ resnet_eps,
+ resnet_act_fn,
+ attn_num_head_channels,
+ resnet_groups=None,
+ cross_attention_dim=None,
+):
+ up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
+ if up_block_type == "UpBlock2D":
+ return UpBlock2D(
+ num_layers=num_layers,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ prev_output_channel=prev_output_channel,
+ temb_channels=temb_channels,
+ add_upsample=add_upsample,
+ resnet_eps=resnet_eps,
+ resnet_act_fn=resnet_act_fn,
+ resnet_groups=resnet_groups,
+ )
+ elif up_block_type == "CrossAttnUpBlock2D":
+ if cross_attention_dim is None:
+ raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D")
+ return CrossAttnUpBlock2D(
+ num_layers=num_layers,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ prev_output_channel=prev_output_channel,
+ temb_channels=temb_channels,
+ add_upsample=add_upsample,
+ resnet_eps=resnet_eps,
+ resnet_act_fn=resnet_act_fn,
+ resnet_groups=resnet_groups,
+ cross_attention_dim=cross_attention_dim,
+ attn_num_head_channels=attn_num_head_channels,
+ )
+ elif up_block_type == "AttnUpBlock2D":
+ return AttnUpBlock2D(
+ num_layers=num_layers,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ prev_output_channel=prev_output_channel,
+ temb_channels=temb_channels,
+ add_upsample=add_upsample,
+ resnet_eps=resnet_eps,
+ resnet_act_fn=resnet_act_fn,
+ resnet_groups=resnet_groups,
+ attn_num_head_channels=attn_num_head_channels,
+ )
+ elif up_block_type == "SkipUpBlock2D":
+ return SkipUpBlock2D(
+ num_layers=num_layers,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ prev_output_channel=prev_output_channel,
+ temb_channels=temb_channels,
+ add_upsample=add_upsample,
+ resnet_eps=resnet_eps,
+ resnet_act_fn=resnet_act_fn,
+ )
+ elif up_block_type == "AttnSkipUpBlock2D":
+ return AttnSkipUpBlock2D(
+ num_layers=num_layers,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ prev_output_channel=prev_output_channel,
+ temb_channels=temb_channels,
+ add_upsample=add_upsample,
+ resnet_eps=resnet_eps,
+ resnet_act_fn=resnet_act_fn,
+ attn_num_head_channels=attn_num_head_channels,
+ )
+ elif up_block_type == "UpDecoderBlock2D":
+ return UpDecoderBlock2D(
+ num_layers=num_layers,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ add_upsample=add_upsample,
+ resnet_eps=resnet_eps,
+ resnet_act_fn=resnet_act_fn,
+ resnet_groups=resnet_groups,
+ )
+ elif up_block_type == "AttnUpDecoderBlock2D":
+ return AttnUpDecoderBlock2D(
+ num_layers=num_layers,
+ in_channels=in_channels,
+ out_channels=out_channels,
+ add_upsample=add_upsample,
+ resnet_eps=resnet_eps,
+ resnet_act_fn=resnet_act_fn,
+ resnet_groups=resnet_groups,
+ attn_num_head_channels=attn_num_head_channels,
+ )
+ raise ValueError(f"{up_block_type} does not exist.")
+
+
+class UNetMidBlock2D(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels=1,
+ attention_type="default",
+ output_scale_factor=1.0,
+ **kwargs,
+ ):
+ super().__init__()
+
+ self.attention_type = attention_type
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+
+ # there is always at least one resnet
+ resnets = [
+ ResnetBlock2D(
+ in_channels=in_channels,
+ out_channels=in_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ ]
+ attentions = []
+
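+        # interleave the stack: resnet -> (attention -> resnet) repeated num_layers times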
+ for _ in range(num_layers):
+ attentions.append(
+ AttentionBlock(
+ in_channels,
+ num_head_channels=attn_num_head_channels,
+ rescale_output_factor=output_scale_factor,
+ eps=resnet_eps,
+ norm_num_groups=resnet_groups,
+ )
+ )
+ resnets.append(
+ ResnetBlock2D(
+ in_channels=in_channels,
+ out_channels=in_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+
+ self.attentions = nn.ModuleList(attentions)
+ self.resnets = nn.ModuleList(resnets)
+
+ def forward(self, hidden_states, temb=None, encoder_states=None):
+ hidden_states = self.resnets[0](hidden_states, temb)
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
+ if self.attention_type == "default":
+ hidden_states = attn(hidden_states)
+ else:
+ hidden_states = attn(hidden_states, encoder_states)
+ hidden_states = resnet(hidden_states, temb)
+
+ return hidden_states
+
+
+class UNetMidBlock2DCrossAttn(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels=1,
+ attention_type="default",
+ output_scale_factor=1.0,
+ cross_attention_dim=1280,
+ **kwargs,
+ ):
+ super().__init__()
+
+ self.attention_type = attention_type
+ self.attn_num_head_channels = attn_num_head_channels
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+
+ # there is always at least one resnet
+ resnets = [
+ ResnetBlock2D(
+ in_channels=in_channels,
+ out_channels=in_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ ]
+ attentions = []
+
+ for _ in range(num_layers):
+ attentions.append(
+ Transformer2DModel(
+ attn_num_head_channels,
+ in_channels // attn_num_head_channels,
+ in_channels=in_channels,
+ num_layers=1,
+ cross_attention_dim=cross_attention_dim,
+ norm_num_groups=resnet_groups,
+ )
+ )
+ resnets.append(
+ ResnetBlock2D(
+ in_channels=in_channels,
+ out_channels=in_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+
+ self.attentions = nn.ModuleList(attentions)
+ self.resnets = nn.ModuleList(resnets)
+
+ def set_attention_slice(self, slice_size):
+ if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
+ raise ValueError(
+ f"Make sure slice_size {slice_size} is a divisor of "
+ f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+ )
+ if slice_size is not None and slice_size > self.attn_num_head_channels:
+ raise ValueError(
+ f"Chunk_size {slice_size} has to be smaller or equal to "
+ f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+ )
+
+ for attn in self.attentions:
+ attn._set_attention_slice(slice_size)
+
+ def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+ for attn in self.attentions:
+ attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
+ def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
+ hidden_states = self.resnets[0](hidden_states, temb)
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
+ hidden_states = attn(hidden_states, encoder_hidden_states).sample
+ hidden_states = resnet(hidden_states, temb)
+
+ return hidden_states
+
+
+class AttnDownBlock2D(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels=1,
+ attention_type="default",
+ output_scale_factor=1.0,
+ downsample_padding=1,
+ add_downsample=True,
+ ):
+ super().__init__()
+ resnets = []
+ attentions = []
+
+ self.attention_type = attention_type
+
+ for i in range(num_layers):
+ in_channels = in_channels if i == 0 else out_channels
+ resnets.append(
+ ResnetBlock2D(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+ attentions.append(
+ AttentionBlock(
+ out_channels,
+ num_head_channels=attn_num_head_channels,
+ rescale_output_factor=output_scale_factor,
+ eps=resnet_eps,
+ norm_num_groups=resnet_groups,
+ )
+ )
+
+ self.attentions = nn.ModuleList(attentions)
+ self.resnets = nn.ModuleList(resnets)
+
+ if add_downsample:
+ self.downsamplers = nn.ModuleList(
+ [
+ Downsample2D(
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+ )
+ ]
+ )
+ else:
+ self.downsamplers = None
+
+ def forward(self, hidden_states, temb=None):
+ output_states = ()
+
+ for resnet, attn in zip(self.resnets, self.attentions):
+ hidden_states = resnet(hidden_states, temb)
+ hidden_states = attn(hidden_states)
+ output_states += (hidden_states,)
+
+ if self.downsamplers is not None:
+ for downsampler in self.downsamplers:
+ hidden_states = downsampler(hidden_states)
+
+ output_states += (hidden_states,)
+
+ return hidden_states, output_states
+
+
+class CrossAttnDownBlock2D(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels=1,
+ cross_attention_dim=1280,
+ attention_type="default",
+ output_scale_factor=1.0,
+ downsample_padding=1,
+ add_downsample=True,
+ ):
+ super().__init__()
+ resnets = []
+ attentions = []
+
+ self.attention_type = attention_type
+ self.attn_num_head_channels = attn_num_head_channels
+
+ for i in range(num_layers):
+ in_channels = in_channels if i == 0 else out_channels
+ resnets.append(
+ ResnetBlock2D(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+ attentions.append(
+ Transformer2DModel(
+ attn_num_head_channels,
+ out_channels // attn_num_head_channels,
+ in_channels=out_channels,
+ num_layers=1,
+ cross_attention_dim=cross_attention_dim,
+ norm_num_groups=resnet_groups,
+ )
+ )
+ self.attentions = nn.ModuleList(attentions)
+ self.resnets = nn.ModuleList(resnets)
+
+ if add_downsample:
+ self.downsamplers = nn.ModuleList(
+ [
+ Downsample2D(
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+ )
+ ]
+ )
+ else:
+ self.downsamplers = None
+
+ self.gradient_checkpointing = False
+
+ def set_attention_slice(self, slice_size):
+ if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
+ raise ValueError(
+ f"Make sure slice_size {slice_size} is a divisor of "
+ f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+ )
+ if slice_size is not None and slice_size > self.attn_num_head_channels:
+ raise ValueError(
+ f"Chunk_size {slice_size} has to be smaller or equal to "
+ f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+ )
+
+ for attn in self.attentions:
+ attn._set_attention_slice(slice_size)
+
+ def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+ for attn in self.attentions:
+ attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
+ def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
+ output_states = ()
+
+ for resnet, attn in zip(self.resnets, self.attentions):
+ if self.training and self.gradient_checkpointing:
+
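+                # torch.utils.checkpoint only passes positional tensors through, so wrap each
+                # module; return_dict=False makes the attention block return a plain tuple
+                # that checkpointing can handle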
+ def create_custom_forward(module, return_dict=None):
+ def custom_forward(*inputs):
+ if return_dict is not None:
+ return module(*inputs, return_dict=return_dict)
+ else:
+ return module(*inputs)
+
+ return custom_forward
+
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states
+ )[0]
+ else:
+ hidden_states = resnet(hidden_states, temb)
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
+
+ output_states += (hidden_states,)
+
+ if self.downsamplers is not None:
+ for downsampler in self.downsamplers:
+ hidden_states = downsampler(hidden_states)
+
+ output_states += (hidden_states,)
+
+ return hidden_states, output_states
+
+
+class DownBlock2D(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ output_scale_factor=1.0,
+ add_downsample=True,
+ downsample_padding=1,
+ ):
+ super().__init__()
+ resnets = []
+
+ for i in range(num_layers):
+ in_channels = in_channels if i == 0 else out_channels
+ resnets.append(
+ ResnetBlock2D(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+
+ self.resnets = nn.ModuleList(resnets)
+
+ if add_downsample:
+ self.downsamplers = nn.ModuleList(
+ [
+ Downsample2D(
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+ )
+ ]
+ )
+ else:
+ self.downsamplers = None
+
+ self.gradient_checkpointing = False
+
+ def forward(self, hidden_states, temb=None):
+ output_states = ()
+
+ for resnet in self.resnets:
+ if self.training and self.gradient_checkpointing:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs)
+
+ return custom_forward
+
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+ else:
+ hidden_states = resnet(hidden_states, temb)
+
+ output_states += (hidden_states,)
+
+ if self.downsamplers is not None:
+ for downsampler in self.downsamplers:
+ hidden_states = downsampler(hidden_states)
+
+ output_states += (hidden_states,)
+
+ return hidden_states, output_states
+
+
+class DownEncoderBlock2D(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ output_scale_factor=1.0,
+ add_downsample=True,
+ downsample_padding=1,
+ ):
+ super().__init__()
+ resnets = []
+
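+        # encoder-side block: no timestep conditioning, so resnets are built with temb_channels=None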
+ for i in range(num_layers):
+ in_channels = in_channels if i == 0 else out_channels
+ resnets.append(
+ ResnetBlock2D(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ temb_channels=None,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+
+ self.resnets = nn.ModuleList(resnets)
+
+ if add_downsample:
+ self.downsamplers = nn.ModuleList(
+ [
+ Downsample2D(
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+ )
+ ]
+ )
+ else:
+ self.downsamplers = None
+
+ def forward(self, hidden_states):
+ for resnet in self.resnets:
+ hidden_states = resnet(hidden_states, temb=None)
+
+ if self.downsamplers is not None:
+ for downsampler in self.downsamplers:
+ hidden_states = downsampler(hidden_states)
+
+ return hidden_states
+
+
+class AttnDownEncoderBlock2D(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels=1,
+ output_scale_factor=1.0,
+ add_downsample=True,
+ downsample_padding=1,
+ ):
+ super().__init__()
+ resnets = []
+ attentions = []
+
+ for i in range(num_layers):
+ in_channels = in_channels if i == 0 else out_channels
+ resnets.append(
+ ResnetBlock2D(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ temb_channels=None,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+ attentions.append(
+ AttentionBlock(
+ out_channels,
+ num_head_channels=attn_num_head_channels,
+ rescale_output_factor=output_scale_factor,
+ eps=resnet_eps,
+ norm_num_groups=resnet_groups,
+ )
+ )
+
+ self.attentions = nn.ModuleList(attentions)
+ self.resnets = nn.ModuleList(resnets)
+
+ if add_downsample:
+ self.downsamplers = nn.ModuleList(
+ [
+ Downsample2D(
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+ )
+ ]
+ )
+ else:
+ self.downsamplers = None
+
+ def forward(self, hidden_states):
+ for resnet, attn in zip(self.resnets, self.attentions):
+ hidden_states = resnet(hidden_states, temb=None)
+ hidden_states = attn(hidden_states)
+
+ if self.downsamplers is not None:
+ for downsampler in self.downsamplers:
+ hidden_states = downsampler(hidden_states)
+
+ return hidden_states
+
+
+class AttnSkipDownBlock2D(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels=1,
+ attention_type="default",
+ output_scale_factor=np.sqrt(2.0),
+ downsample_padding=1,
+ add_downsample=True,
+ ):
+ super().__init__()
+ self.attentions = nn.ModuleList([])
+ self.resnets = nn.ModuleList([])
+
+ self.attention_type = attention_type
+
+ for i in range(num_layers):
+ in_channels = in_channels if i == 0 else out_channels
+ self.resnets.append(
+ ResnetBlock2D(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=min(in_channels // 4, 32),
+ groups_out=min(out_channels // 4, 32),
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+ self.attentions.append(
+ AttentionBlock(
+ out_channels,
+ num_head_channels=attn_num_head_channels,
+ rescale_output_factor=output_scale_factor,
+ eps=resnet_eps,
+ )
+ )
+
+ if add_downsample:
+ self.resnet_down = ResnetBlock2D(
+ in_channels=out_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=min(out_channels // 4, 32),
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ use_in_shortcut=True,
+ down=True,
+ kernel="fir",
+ )
+ self.downsamplers = nn.ModuleList([FirDownsample2D(out_channels, out_channels=out_channels)])
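+            # skip_sample carries the progressively downsampled 3-channel input image;
+            # project it to the block's feature width before adding it to hidden_states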
+ self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
+ else:
+ self.resnet_down = None
+ self.downsamplers = None
+ self.skip_conv = None
+
+ def forward(self, hidden_states, temb=None, skip_sample=None):
+ output_states = ()
+
+ for resnet, attn in zip(self.resnets, self.attentions):
+ hidden_states = resnet(hidden_states, temb)
+ hidden_states = attn(hidden_states)
+ output_states += (hidden_states,)
+
+ if self.downsamplers is not None:
+ hidden_states = self.resnet_down(hidden_states, temb)
+ for downsampler in self.downsamplers:
+ skip_sample = downsampler(skip_sample)
+
+ hidden_states = self.skip_conv(skip_sample) + hidden_states
+
+ output_states += (hidden_states,)
+
+ return hidden_states, output_states, skip_sample
+
+
+class SkipDownBlock2D(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_pre_norm: bool = True,
+ output_scale_factor=np.sqrt(2.0),
+ add_downsample=True,
+ downsample_padding=1,
+ ):
+ super().__init__()
+ self.resnets = nn.ModuleList([])
+
+ for i in range(num_layers):
+ in_channels = in_channels if i == 0 else out_channels
+ self.resnets.append(
+ ResnetBlock2D(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=min(in_channels // 4, 32),
+ groups_out=min(out_channels // 4, 32),
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+
+ if add_downsample:
+ self.resnet_down = ResnetBlock2D(
+ in_channels=out_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=min(out_channels // 4, 32),
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ use_in_shortcut=True,
+ down=True,
+ kernel="fir",
+ )
+ self.downsamplers = nn.ModuleList([FirDownsample2D(out_channels, out_channels=out_channels)])
+ self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
+ else:
+ self.resnet_down = None
+ self.downsamplers = None
+ self.skip_conv = None
+
+ def forward(self, hidden_states, temb=None, skip_sample=None):
+ output_states = ()
+
+ for resnet in self.resnets:
+ hidden_states = resnet(hidden_states, temb)
+ output_states += (hidden_states,)
+
+ if self.downsamplers is not None:
+ hidden_states = self.resnet_down(hidden_states, temb)
+ for downsampler in self.downsamplers:
+ skip_sample = downsampler(skip_sample)
+
+ hidden_states = self.skip_conv(skip_sample) + hidden_states
+
+ output_states += (hidden_states,)
+
+ return hidden_states, output_states, skip_sample
+
+
+class AttnUpBlock2D(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ prev_output_channel: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attention_type="default",
+ attn_num_head_channels=1,
+ output_scale_factor=1.0,
+ add_upsample=True,
+ ):
+ super().__init__()
+ resnets = []
+ attentions = []
+
+ self.attention_type = attention_type
+
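+        # each resnet consumes the running features concatenated with one skip tensor
+        # popped from the down path; the last layer's skip still carries in_channels channels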
+ for i in range(num_layers):
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+ resnets.append(
+ ResnetBlock2D(
+ in_channels=resnet_in_channels + res_skip_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+ attentions.append(
+ AttentionBlock(
+ out_channels,
+ num_head_channels=attn_num_head_channels,
+ rescale_output_factor=output_scale_factor,
+ eps=resnet_eps,
+ norm_num_groups=resnet_groups,
+ )
+ )
+
+ self.attentions = nn.ModuleList(attentions)
+ self.resnets = nn.ModuleList(resnets)
+
+ if add_upsample:
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+ else:
+ self.upsamplers = None
+
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
+ for resnet, attn in zip(self.resnets, self.attentions):
+ # pop res hidden states
+ res_hidden_states = res_hidden_states_tuple[-1]
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+ hidden_states = resnet(hidden_states, temb)
+ hidden_states = attn(hidden_states)
+
+ if self.upsamplers is not None:
+ for upsampler in self.upsamplers:
+ hidden_states = upsampler(hidden_states)
+
+ return hidden_states
+
+
+class CrossAttnUpBlock2D(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ prev_output_channel: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels=1,
+ cross_attention_dim=1280,
+ attention_type="default",
+ output_scale_factor=1.0,
+ add_upsample=True,
+ ):
+ super().__init__()
+ resnets = []
+ attentions = []
+
+ self.attention_type = attention_type
+ self.attn_num_head_channels = attn_num_head_channels
+
+ for i in range(num_layers):
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+ resnets.append(
+ ResnetBlock2D(
+ in_channels=resnet_in_channels + res_skip_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+ attentions.append(
+ Transformer2DModel(
+ attn_num_head_channels,
+ out_channels // attn_num_head_channels,
+ in_channels=out_channels,
+ num_layers=1,
+ cross_attention_dim=cross_attention_dim,
+ norm_num_groups=resnet_groups,
+ )
+ )
+ self.attentions = nn.ModuleList(attentions)
+ self.resnets = nn.ModuleList(resnets)
+
+ if add_upsample:
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+ else:
+ self.upsamplers = None
+
+ self.gradient_checkpointing = False
+
+ def set_attention_slice(self, slice_size):
+ if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
+ raise ValueError(
+ f"Make sure slice_size {slice_size} is a divisor of "
+ f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+ )
+ if slice_size is not None and slice_size > self.attn_num_head_channels:
+ raise ValueError(
+ f"Chunk_size {slice_size} has to be smaller or equal to "
+ f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+ )
+
+ for attn in self.attentions:
+ attn._set_attention_slice(slice_size)
+
+ def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+ for attn in self.attentions:
+ attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
+ def forward(
+ self,
+ hidden_states,
+ res_hidden_states_tuple,
+ temb=None,
+ encoder_hidden_states=None,
+ upsample_size=None,
+ ):
+ for resnet, attn in zip(self.resnets, self.attentions):
+ # pop res hidden states
+ res_hidden_states = res_hidden_states_tuple[-1]
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+ if self.training and self.gradient_checkpointing:
+
+ def create_custom_forward(module, return_dict=None):
+ def custom_forward(*inputs):
+ if return_dict is not None:
+ return module(*inputs, return_dict=return_dict)
+ else:
+ return module(*inputs)
+
+ return custom_forward
+
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+ hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states
+ )[0]
+ else:
+ hidden_states = resnet(hidden_states, temb)
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
+
+ if self.upsamplers is not None:
+ for upsampler in self.upsamplers:
+ hidden_states = upsampler(hidden_states, upsample_size)
+
+ return hidden_states
+
+
+class UpBlock2D(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ prev_output_channel: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ output_scale_factor=1.0,
+ add_upsample=True,
+ ):
+ super().__init__()
+ resnets = []
+
+ for i in range(num_layers):
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+ resnets.append(
+ ResnetBlock2D(
+ in_channels=resnet_in_channels + res_skip_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+
+ self.resnets = nn.ModuleList(resnets)
+
+ if add_upsample:
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+ else:
+ self.upsamplers = None
+
+ self.gradient_checkpointing = False
+
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
+ for resnet in self.resnets:
+ # pop res hidden states
+ res_hidden_states = res_hidden_states_tuple[-1]
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+ if self.training and self.gradient_checkpointing:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs)
+
+ return custom_forward
+
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+ else:
+ hidden_states = resnet(hidden_states, temb)
+
+ if self.upsamplers is not None:
+ for upsampler in self.upsamplers:
+ hidden_states = upsampler(hidden_states, upsample_size)
+
+ return hidden_states
+
+
+class UpDecoderBlock2D(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ output_scale_factor=1.0,
+ add_upsample=True,
+ ):
+ super().__init__()
+ resnets = []
+
+ for i in range(num_layers):
+ input_channels = in_channels if i == 0 else out_channels
+
+ resnets.append(
+ ResnetBlock2D(
+ in_channels=input_channels,
+ out_channels=out_channels,
+ temb_channels=None,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+
+ self.resnets = nn.ModuleList(resnets)
+
+ if add_upsample:
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+ else:
+ self.upsamplers = None
+
+ def forward(self, hidden_states):
+ for resnet in self.resnets:
+ hidden_states = resnet(hidden_states, temb=None)
+
+ if self.upsamplers is not None:
+ for upsampler in self.upsamplers:
+ hidden_states = upsampler(hidden_states)
+
+ return hidden_states
+
+
+class AttnUpDecoderBlock2D(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ out_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_groups: int = 32,
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels=1,
+ output_scale_factor=1.0,
+ add_upsample=True,
+ ):
+ super().__init__()
+ resnets = []
+ attentions = []
+
+ for i in range(num_layers):
+ input_channels = in_channels if i == 0 else out_channels
+
+ resnets.append(
+ ResnetBlock2D(
+ in_channels=input_channels,
+ out_channels=out_channels,
+ temb_channels=None,
+ eps=resnet_eps,
+ groups=resnet_groups,
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+ attentions.append(
+ AttentionBlock(
+ out_channels,
+ num_head_channels=attn_num_head_channels,
+ rescale_output_factor=output_scale_factor,
+ eps=resnet_eps,
+ norm_num_groups=resnet_groups,
+ )
+ )
+
+ self.attentions = nn.ModuleList(attentions)
+ self.resnets = nn.ModuleList(resnets)
+
+ if add_upsample:
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+ else:
+ self.upsamplers = None
+
+ def forward(self, hidden_states):
+ for resnet, attn in zip(self.resnets, self.attentions):
+ hidden_states = resnet(hidden_states, temb=None)
+ hidden_states = attn(hidden_states)
+
+ if self.upsamplers is not None:
+ for upsampler in self.upsamplers:
+ hidden_states = upsampler(hidden_states)
+
+ return hidden_states
+
+
+class AttnSkipUpBlock2D(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ prev_output_channel: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_pre_norm: bool = True,
+ attn_num_head_channels=1,
+ attention_type="default",
+ output_scale_factor=np.sqrt(2.0),
+ upsample_padding=1,
+ add_upsample=True,
+ ):
+ super().__init__()
+ self.attentions = nn.ModuleList([])
+ self.resnets = nn.ModuleList([])
+
+ self.attention_type = attention_type
+
+ for i in range(num_layers):
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+ self.resnets.append(
+ ResnetBlock2D(
+ in_channels=resnet_in_channels + res_skip_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+                    groups=min((resnet_in_channels + res_skip_channels) // 4, 32),
+ groups_out=min(out_channels // 4, 32),
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+
+ self.attentions.append(
+ AttentionBlock(
+ out_channels,
+ num_head_channels=attn_num_head_channels,
+ rescale_output_factor=output_scale_factor,
+ eps=resnet_eps,
+ )
+ )
+
+ self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels)
+ if add_upsample:
+ self.resnet_up = ResnetBlock2D(
+ in_channels=out_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=min(out_channels // 4, 32),
+ groups_out=min(out_channels // 4, 32),
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ use_in_shortcut=True,
+ up=True,
+ kernel="fir",
+ )
+ self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+ self.skip_norm = torch.nn.GroupNorm(
+ num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
+ )
+ self.act = nn.SiLU()
+ else:
+ self.resnet_up = None
+ self.skip_conv = None
+ self.skip_norm = None
+ self.act = None
+
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None):
+ for resnet in self.resnets:
+ # pop res hidden states
+ res_hidden_states = res_hidden_states_tuple[-1]
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+ hidden_states = resnet(hidden_states, temb)
+
+ hidden_states = self.attentions[0](hidden_states)
+
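+        # grow the skip image alongside the features: upsample the incoming skip (or start
+        # at 0) and add this block's projected contribution, mirroring the down-path skips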
+ if skip_sample is not None:
+ skip_sample = self.upsampler(skip_sample)
+ else:
+ skip_sample = 0
+
+ if self.resnet_up is not None:
+ skip_sample_states = self.skip_norm(hidden_states)
+ skip_sample_states = self.act(skip_sample_states)
+ skip_sample_states = self.skip_conv(skip_sample_states)
+
+ skip_sample = skip_sample + skip_sample_states
+
+ hidden_states = self.resnet_up(hidden_states, temb)
+
+ return hidden_states, skip_sample
+
+
+class SkipUpBlock2D(nn.Module):
+ def __init__(
+ self,
+ in_channels: int,
+ prev_output_channel: int,
+ out_channels: int,
+ temb_channels: int,
+ dropout: float = 0.0,
+ num_layers: int = 1,
+ resnet_eps: float = 1e-6,
+ resnet_time_scale_shift: str = "default",
+ resnet_act_fn: str = "swish",
+ resnet_pre_norm: bool = True,
+ output_scale_factor=np.sqrt(2.0),
+ add_upsample=True,
+ upsample_padding=1,
+ ):
+ super().__init__()
+ self.resnets = nn.ModuleList([])
+
+ for i in range(num_layers):
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+ self.resnets.append(
+ ResnetBlock2D(
+ in_channels=resnet_in_channels + res_skip_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=min((resnet_in_channels + res_skip_channels) // 4, 32),
+ groups_out=min(out_channels // 4, 32),
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ )
+ )
+
+ self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels)
+ if add_upsample:
+ self.resnet_up = ResnetBlock2D(
+ in_channels=out_channels,
+ out_channels=out_channels,
+ temb_channels=temb_channels,
+ eps=resnet_eps,
+ groups=min(out_channels // 4, 32),
+ groups_out=min(out_channels // 4, 32),
+ dropout=dropout,
+ time_embedding_norm=resnet_time_scale_shift,
+ non_linearity=resnet_act_fn,
+ output_scale_factor=output_scale_factor,
+ pre_norm=resnet_pre_norm,
+ use_in_shortcut=True,
+ up=True,
+ kernel="fir",
+ )
+ self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
+ self.skip_norm = torch.nn.GroupNorm(
+ num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
+ )
+ self.act = nn.SiLU()
+ else:
+ self.resnet_up = None
+ self.skip_conv = None
+ self.skip_norm = None
+ self.act = None
+
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None):
+ for resnet in self.resnets:
+ # pop res hidden states
+ res_hidden_states = res_hidden_states_tuple[-1]
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+ hidden_states = resnet(hidden_states, temb)
+
+ if skip_sample is not None:
+ skip_sample = self.upsampler(skip_sample)
+ else:
+ skip_sample = 0
+
+ if self.resnet_up is not None:
+ skip_sample_states = self.skip_norm(hidden_states)
+ skip_sample_states = self.act(skip_sample_states)
+ skip_sample_states = self.skip_conv(skip_sample_states)
+
+ skip_sample = skip_sample + skip_sample_states
+
+ hidden_states = self.resnet_up(hidden_states, temb)
+
+ return hidden_states, skip_sample
diff --git a/src/model/TextGen/diffusers/models/unet_2d_blocks_flax.py b/src/model/TextGen/diffusers/models/unet_2d_blocks_flax.py
new file mode 100644
index 0000000..5798385
--- /dev/null
+++ b/src/model/TextGen/diffusers/models/unet_2d_blocks_flax.py
@@ -0,0 +1,355 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import flax.linen as nn
+import jax.numpy as jnp
+
+from .attention_flax import FlaxTransformer2DModel
+from .resnet_flax import FlaxDownsample2D, FlaxResnetBlock2D, FlaxUpsample2D
+
+
+class FlaxCrossAttnDownBlock2D(nn.Module):
+ r"""
+    Cross Attention 2D Downsampling block - original architecture from Unet transformers:
+ https://arxiv.org/abs/2103.06104
+
+ Parameters:
+ in_channels (:obj:`int`):
+ Input channels
+ out_channels (:obj:`int`):
+ Output channels
+ dropout (:obj:`float`, *optional*, defaults to 0.0):
+ Dropout rate
+ num_layers (:obj:`int`, *optional*, defaults to 1):
+            Number of attention block layers
+ attn_num_head_channels (:obj:`int`, *optional*, defaults to 1):
+ Number of attention heads of each spatial transformer block
+ add_downsample (:obj:`bool`, *optional*, defaults to `True`):
+ Whether to add downsampling layer before each final output
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+ """
+ in_channels: int
+ out_channels: int
+ dropout: float = 0.0
+ num_layers: int = 1
+ attn_num_head_channels: int = 1
+ add_downsample: bool = True
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ resnets = []
+ attentions = []
+
+ for i in range(self.num_layers):
+ in_channels = self.in_channels if i == 0 else self.out_channels
+
+ res_block = FlaxResnetBlock2D(
+ in_channels=in_channels,
+ out_channels=self.out_channels,
+ dropout_prob=self.dropout,
+ dtype=self.dtype,
+ )
+ resnets.append(res_block)
+
+ attn_block = FlaxTransformer2DModel(
+ in_channels=self.out_channels,
+ n_heads=self.attn_num_head_channels,
+ d_head=self.out_channels // self.attn_num_head_channels,
+ depth=1,
+ dtype=self.dtype,
+ )
+ attentions.append(attn_block)
+
+ self.resnets = resnets
+ self.attentions = attentions
+
+ if self.add_downsample:
+ self.downsamplers_0 = FlaxDownsample2D(self.out_channels, dtype=self.dtype)
+
+ def __call__(self, hidden_states, temb, encoder_hidden_states, deterministic=True):
+ output_states = ()
+
+ for resnet, attn in zip(self.resnets, self.attentions):
+ hidden_states = resnet(hidden_states, temb, deterministic=deterministic)
+ hidden_states = attn(hidden_states, encoder_hidden_states, deterministic=deterministic)
+ output_states += (hidden_states,)
+
+ if self.add_downsample:
+ hidden_states = self.downsamplers_0(hidden_states)
+ output_states += (hidden_states,)
+
+ return hidden_states, output_states
+
+
+class FlaxDownBlock2D(nn.Module):
+ r"""
+    Flax 2D downsampling block
+
+ Parameters:
+ in_channels (:obj:`int`):
+ Input channels
+ out_channels (:obj:`int`):
+ Output channels
+ dropout (:obj:`float`, *optional*, defaults to 0.0):
+ Dropout rate
+ num_layers (:obj:`int`, *optional*, defaults to 1):
+            Number of resnet layers
+ add_downsample (:obj:`bool`, *optional*, defaults to `True`):
+ Whether to add downsampling layer before each final output
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+ """
+ in_channels: int
+ out_channels: int
+ dropout: float = 0.0
+ num_layers: int = 1
+ add_downsample: bool = True
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ resnets = []
+
+ for i in range(self.num_layers):
+ in_channels = self.in_channels if i == 0 else self.out_channels
+
+ res_block = FlaxResnetBlock2D(
+ in_channels=in_channels,
+ out_channels=self.out_channels,
+ dropout_prob=self.dropout,
+ dtype=self.dtype,
+ )
+ resnets.append(res_block)
+ self.resnets = resnets
+
+ if self.add_downsample:
+ self.downsamplers_0 = FlaxDownsample2D(self.out_channels, dtype=self.dtype)
+
+ def __call__(self, hidden_states, temb, deterministic=True):
+ output_states = ()
+
+ for resnet in self.resnets:
+ hidden_states = resnet(hidden_states, temb, deterministic=deterministic)
+ output_states += (hidden_states,)
+
+ if self.add_downsample:
+ hidden_states = self.downsamplers_0(hidden_states)
+ output_states += (hidden_states,)
+
+ return hidden_states, output_states
+
+
+class FlaxCrossAttnUpBlock2D(nn.Module):
+ r"""
+ Cross Attention 2D Upsampling block - original architecture from Unet transformers:
+ https://arxiv.org/abs/2103.06104
+
+ Parameters:
+ in_channels (:obj:`int`):
+ Input channels
+        out_channels (:obj:`int`):
+            Output channels
+        prev_output_channel (:obj:`int`):
+            Output channels from the previous block
+ dropout (:obj:`float`, *optional*, defaults to 0.0):
+ Dropout rate
+ num_layers (:obj:`int`, *optional*, defaults to 1):
+            Number of attention block layers
+ attn_num_head_channels (:obj:`int`, *optional*, defaults to 1):
+ Number of attention heads of each spatial transformer block
+ add_upsample (:obj:`bool`, *optional*, defaults to `True`):
+ Whether to add upsampling layer before each final output
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+ """
+ in_channels: int
+ out_channels: int
+ prev_output_channel: int
+ dropout: float = 0.0
+ num_layers: int = 1
+ attn_num_head_channels: int = 1
+ add_upsample: bool = True
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ resnets = []
+ attentions = []
+
+ for i in range(self.num_layers):
+ res_skip_channels = self.in_channels if (i == self.num_layers - 1) else self.out_channels
+ resnet_in_channels = self.prev_output_channel if i == 0 else self.out_channels
+
+ res_block = FlaxResnetBlock2D(
+ in_channels=resnet_in_channels + res_skip_channels,
+ out_channels=self.out_channels,
+ dropout_prob=self.dropout,
+ dtype=self.dtype,
+ )
+ resnets.append(res_block)
+
+ attn_block = FlaxTransformer2DModel(
+ in_channels=self.out_channels,
+ n_heads=self.attn_num_head_channels,
+ d_head=self.out_channels // self.attn_num_head_channels,
+ depth=1,
+ dtype=self.dtype,
+ )
+ attentions.append(attn_block)
+
+ self.resnets = resnets
+ self.attentions = attentions
+
+ if self.add_upsample:
+ self.upsamplers_0 = FlaxUpsample2D(self.out_channels, dtype=self.dtype)
+
+ def __call__(self, hidden_states, res_hidden_states_tuple, temb, encoder_hidden_states, deterministic=True):
+ for resnet, attn in zip(self.resnets, self.attentions):
+ # pop res hidden states
+ res_hidden_states = res_hidden_states_tuple[-1]
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
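+            # Flax convolutions use NHWC layout, so skip features concatenate on the last axis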
+ hidden_states = jnp.concatenate((hidden_states, res_hidden_states), axis=-1)
+
+ hidden_states = resnet(hidden_states, temb, deterministic=deterministic)
+ hidden_states = attn(hidden_states, encoder_hidden_states, deterministic=deterministic)
+
+ if self.add_upsample:
+ hidden_states = self.upsamplers_0(hidden_states)
+
+ return hidden_states
+
+
+class FlaxUpBlock2D(nn.Module):
+ r"""
+ Flax 2D upsampling block
+
+ Parameters:
+ in_channels (:obj:`int`):
+ Input channels
+ out_channels (:obj:`int`):
+ Output channels
+ prev_output_channel (:obj:`int`):
+ Output channels from the previous block
+ dropout (:obj:`float`, *optional*, defaults to 0.0):
+ Dropout rate
+ num_layers (:obj:`int`, *optional*, defaults to 1):
+            Number of resnet layers
+        add_upsample (:obj:`bool`, *optional*, defaults to `True`):
+            Whether to add upsampling layer before each final output
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+ """
+ in_channels: int
+ out_channels: int
+ prev_output_channel: int
+ dropout: float = 0.0
+ num_layers: int = 1
+ add_upsample: bool = True
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ resnets = []
+
+ for i in range(self.num_layers):
+ res_skip_channels = self.in_channels if (i == self.num_layers - 1) else self.out_channels
+ resnet_in_channels = self.prev_output_channel if i == 0 else self.out_channels
+
+ res_block = FlaxResnetBlock2D(
+ in_channels=resnet_in_channels + res_skip_channels,
+ out_channels=self.out_channels,
+ dropout_prob=self.dropout,
+ dtype=self.dtype,
+ )
+ resnets.append(res_block)
+
+ self.resnets = resnets
+
+ if self.add_upsample:
+ self.upsamplers_0 = FlaxUpsample2D(self.out_channels, dtype=self.dtype)
+
+ def __call__(self, hidden_states, res_hidden_states_tuple, temb, deterministic=True):
+ for resnet in self.resnets:
+ # pop res hidden states
+ res_hidden_states = res_hidden_states_tuple[-1]
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+ hidden_states = jnp.concatenate((hidden_states, res_hidden_states), axis=-1)
+
+ hidden_states = resnet(hidden_states, temb, deterministic=deterministic)
+
+ if self.add_upsample:
+ hidden_states = self.upsamplers_0(hidden_states)
+
+ return hidden_states
+
+
+class FlaxUNetMidBlock2DCrossAttn(nn.Module):
+ r"""
+ Cross Attention 2D Mid-level block - original architecture from Unet transformers: https://arxiv.org/abs/2103.06104
+
+ Parameters:
+ in_channels (:obj:`int`):
+ Input channels
+ dropout (:obj:`float`, *optional*, defaults to 0.0):
+ Dropout rate
+ num_layers (:obj:`int`, *optional*, defaults to 1):
+            Number of attention block layers
+ attn_num_head_channels (:obj:`int`, *optional*, defaults to 1):
+ Number of attention heads of each spatial transformer block
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+ """
+ in_channels: int
+ dropout: float = 0.0
+ num_layers: int = 1
+ attn_num_head_channels: int = 1
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ # there is always at least one resnet
+ resnets = [
+ FlaxResnetBlock2D(
+ in_channels=self.in_channels,
+ out_channels=self.in_channels,
+ dropout_prob=self.dropout,
+ dtype=self.dtype,
+ )
+ ]
+
+ attentions = []
+
+ for _ in range(self.num_layers):
+ attn_block = FlaxTransformer2DModel(
+ in_channels=self.in_channels,
+ n_heads=self.attn_num_head_channels,
+ d_head=self.in_channels // self.attn_num_head_channels,
+ depth=1,
+ dtype=self.dtype,
+ )
+ attentions.append(attn_block)
+
+ res_block = FlaxResnetBlock2D(
+ in_channels=self.in_channels,
+ out_channels=self.in_channels,
+ dropout_prob=self.dropout,
+ dtype=self.dtype,
+ )
+ resnets.append(res_block)
+
+ self.resnets = resnets
+ self.attentions = attentions
+
+ def __call__(self, hidden_states, temb, encoder_hidden_states, deterministic=True):
+ hidden_states = self.resnets[0](hidden_states, temb)
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
+ hidden_states = attn(hidden_states, encoder_hidden_states, deterministic=deterministic)
+ hidden_states = resnet(hidden_states, temb, deterministic=deterministic)
+
+ return hidden_states
diff --git a/src/model/TextGen/diffusers/models/unet_2d_condition.py b/src/model/TextGen/diffusers/models/unet_2d_condition.py
new file mode 100644
index 0000000..438bbd9
--- /dev/null
+++ b/src/model/TextGen/diffusers/models/unet_2d_condition.py
@@ -0,0 +1,353 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union, Dict
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..modeling_utils import ModelMixin
+from ..utils import BaseOutput, logging
+from .embeddings import TimestepEmbedding, Timesteps
+from .unet_2d_blocks import (
+ CrossAttnDownBlock2D,
+ CrossAttnUpBlock2D,
+ DownBlock2D,
+ UNetMidBlock2DCrossAttn,
+ UpBlock2D,
+ get_down_block,
+ get_up_block,
+)
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+@dataclass
+class UNet2DConditionOutput(BaseOutput):
+ """
+ Args:
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Hidden states conditioned on `encoder_hidden_states` input. Output of last layer of model.
+ """
+
+ sample: torch.FloatTensor
+
+
+class UNet2DConditionModel(ModelMixin, ConfigMixin):
+ r"""
+ UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep
+ and returns sample shaped output.
+
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
+ implements for all the models (such as downloading or saving, etc.)
+
+ Parameters:
+ sample_size (`int`, *optional*): The size of the input sample.
+ in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
+ out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
+ center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
+ flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
+ Whether to flip the sin to cos in the time embedding.
+ freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+ The tuple of downsample blocks to use.
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`):
+ The tuple of upsample blocks to use.
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+ The tuple of output channels for each block.
+ layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+ downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
+ mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+ norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+ norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
+ cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
+ attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
+ """
+
+ _supports_gradient_checkpointing = True
+
+ @register_to_config
+ def __init__(
+ self,
+ sample_size: Optional[int] = None,
+ in_channels: int = 4,
+ out_channels: int = 4,
+ center_input_sample: bool = False,
+ flip_sin_to_cos: bool = True,
+ freq_shift: int = 0,
+ down_block_types: Tuple[str] = (
+ "CrossAttnDownBlock2D",
+ "CrossAttnDownBlock2D",
+ "CrossAttnDownBlock2D",
+ "DownBlock2D",
+ ),
+ up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+ layers_per_block: int = 2,
+ downsample_padding: int = 1,
+ mid_block_scale_factor: float = 1,
+ act_fn: str = "silu",
+ norm_num_groups: int = 32,
+ norm_eps: float = 1e-5,
+ cross_attention_dim: int = 1280,
+ attention_head_dim: int = 8,
+ ):
+ super().__init__()
+
+ self.sample_size = sample_size
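+        # sinusoidal timestep features (block_out_channels[0] wide) are projected
+        # to 4x that width by the TimestepEmbedding MLP below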
+ time_embed_dim = block_out_channels[0] * 4
+
+ # input
+ self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))
+
+ # time
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+ timestep_input_dim = block_out_channels[0]
+
+ self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+
+ self.down_blocks = nn.ModuleList([])
+ self.mid_block = None
+ self.up_blocks = nn.ModuleList([])
+
+ # down
+ output_channel = block_out_channels[0]
+ for i, down_block_type in enumerate(down_block_types):
+ input_channel = output_channel
+ output_channel = block_out_channels[i]
+ is_final_block = i == len(block_out_channels) - 1
+
+ down_block = get_down_block(
+ down_block_type,
+ num_layers=layers_per_block,
+ in_channels=input_channel,
+ out_channels=output_channel,
+ temb_channels=time_embed_dim,
+ add_downsample=not is_final_block,
+ resnet_eps=norm_eps,
+ resnet_act_fn=act_fn,
+ resnet_groups=norm_num_groups,
+ cross_attention_dim=cross_attention_dim,
+ attn_num_head_channels=attention_head_dim,
+ downsample_padding=downsample_padding,
+ )
+ self.down_blocks.append(down_block)
+
+ # mid
+ self.mid_block = UNetMidBlock2DCrossAttn(
+ in_channels=block_out_channels[-1],
+ temb_channels=time_embed_dim,
+ resnet_eps=norm_eps,
+ resnet_act_fn=act_fn,
+ output_scale_factor=mid_block_scale_factor,
+ resnet_time_scale_shift="default",
+ cross_attention_dim=cross_attention_dim,
+ attn_num_head_channels=attention_head_dim,
+ resnet_groups=norm_num_groups,
+ )
+
+ # count how many layers upsample the images
+ self.num_upsamplers = 0
+
+ # up
+ reversed_block_out_channels = list(reversed(block_out_channels))
+ output_channel = reversed_block_out_channels[0]
+ for i, up_block_type in enumerate(up_block_types):
+ is_final_block = i == len(block_out_channels) - 1
+
+ prev_output_channel = output_channel
+ output_channel = reversed_block_out_channels[i]
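+            # channel width of the skip tensors from the next-shallower level; min() clamps
+            # the index so the final up block matches conv_in's width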
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+ # add upsample block for all BUT final layer
+ if not is_final_block:
+ add_upsample = True
+ self.num_upsamplers += 1
+ else:
+ add_upsample = False
+
+ up_block = get_up_block(
+ up_block_type,
+ num_layers=layers_per_block + 1,
+ in_channels=input_channel,
+ out_channels=output_channel,
+ prev_output_channel=prev_output_channel,
+ temb_channels=time_embed_dim,
+ add_upsample=add_upsample,
+ resnet_eps=norm_eps,
+ resnet_act_fn=act_fn,
+ resnet_groups=norm_num_groups,
+ cross_attention_dim=cross_attention_dim,
+ attn_num_head_channels=attention_head_dim,
+ )
+ self.up_blocks.append(up_block)
+ prev_output_channel = output_channel
+
+ # out
+ self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps)
+ self.conv_act = nn.SiLU()
+ self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1)
+
+ def set_attention_slice(self, slice_size):
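+        # attention slicing evaluates attention over head subsets sequentially,
+        # trading throughput for lower peak memory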
+ if slice_size is not None and self.config.attention_head_dim % slice_size != 0:
+ raise ValueError(
+ f"Make sure slice_size {slice_size} is a divisor of "
+ f"the number of heads used in cross_attention {self.config.attention_head_dim}"
+ )
+ if slice_size is not None and slice_size > self.config.attention_head_dim:
+ raise ValueError(
+ f"Chunk_size {slice_size} has to be smaller or equal to "
+ f"the number of heads used in cross_attention {self.config.attention_head_dim}"
+ )
+
+ for block in self.down_blocks:
+ if hasattr(block, "attentions") and block.attentions is not None:
+ block.set_attention_slice(slice_size)
+
+ self.mid_block.set_attention_slice(slice_size)
+
+ for block in self.up_blocks:
+ if hasattr(block, "attentions") and block.attentions is not None:
+ block.set_attention_slice(slice_size)
+
+ def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+ for block in self.down_blocks:
+ if hasattr(block, "attentions") and block.attentions is not None:
+ block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
+ self.mid_block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
+ for block in self.up_blocks:
+ if hasattr(block, "attentions") and block.attentions is not None:
+ block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)):
+ module.gradient_checkpointing = value
+
+ def forward(
+ self,
+ sample: torch.FloatTensor,
+ timestep: Union[torch.Tensor, float, int],
+ encoder_hidden_states: torch.Tensor,
+ char_hidden_states: torch.Tensor = None,
+ return_dict: bool = True,
+ ) -> Union[UNet2DConditionOutput, Tuple]:
+ r"""
+ Args:
+ sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
+ timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
+            encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
+            char_hidden_states (`torch.Tensor`, *optional*):
+                character-level hidden states; accepted by this forward pass but currently unused in it
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+
+ Returns:
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
+ returning a tuple, the first element is the sample tensor.
+ """
+        # By default samples have to be at least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (number of upsampling layers).
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
+ # on the fly if necessary.
+ default_overall_up_factor = 2**self.num_upsamplers
+
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+ forward_upsample_size = False
+ upsample_size = None
+
+ if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+ logger.info("Forward upsample size to force interpolation output size.")
+ forward_upsample_size = True
+
+ # 0. center input if necessary
+ if self.config.center_input_sample:
+ sample = 2 * sample - 1.0
+
+ # 1. time
+ timesteps = timestep
+ if not torch.is_tensor(timesteps):
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+ timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
+ elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
+ timesteps = timesteps[None].to(sample.device)
+
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+ timesteps = timesteps.expand(sample.shape[0])
+
+ t_emb = self.time_proj(timesteps)
+
+        # `time_proj` does not contain any weights and always returns f32 tensors,
+        # but `time_embedding` might actually be running in fp16, so we need to cast here.
+        # There might be better ways to encapsulate this.
+ t_emb = t_emb.to(dtype=self.dtype)
+ emb = self.time_embedding(t_emb)
+
+ # 2. pre-process
+ sample = self.conv_in(sample)
+
+ # 3. down
+ down_block_res_samples = (sample,)
+ for downsample_block in self.down_blocks:
+ if hasattr(downsample_block, "attentions") and downsample_block.attentions is not None:
+ sample, res_samples = downsample_block(
+ hidden_states=sample,
+ temb=emb,
+ encoder_hidden_states=encoder_hidden_states,
+ )
+ else:
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+
+ down_block_res_samples += res_samples
+
+ # 4. mid
+ sample = self.mid_block(sample, emb, encoder_hidden_states=encoder_hidden_states)
+
+ # 5. up
+ for i, upsample_block in enumerate(self.up_blocks):
+ is_final_block = i == len(self.up_blocks) - 1
+
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+ # if we have not reached the final block and need to forward the
+ # upsample size, we do it here
+ if not is_final_block and forward_upsample_size:
+ upsample_size = down_block_res_samples[-1].shape[2:]
+
+ if hasattr(upsample_block, "attentions") and upsample_block.attentions is not None:
+ sample = upsample_block(
+ hidden_states=sample,
+ temb=emb,
+ res_hidden_states_tuple=res_samples,
+ encoder_hidden_states=encoder_hidden_states,
+ upsample_size=upsample_size,
+ )
+ else:
+ sample = upsample_block(
+ hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
+ )
+ # 6. post-process
+ sample = self.conv_norm_out(sample)
+ sample = self.conv_act(sample)
+ sample = self.conv_out(sample)
+
+ if not return_dict:
+ return (sample,)
+
+ return UNet2DConditionOutput(sample=sample)
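+
+
+if __name__ == "__main__":
+    # Editorial smoke-test sketch, not part of the library. It assumes the class
+    # defined above is `UNet2DConditionModel` as in upstream diffusers, with
+    # stable-diffusion-v1-5 latent shapes (4 channels, 64x64 latents) and
+    # cross_attention_dim=768; instantiating the default-size UNet is heavy.
+    unet = UNet2DConditionModel(sample_size=64, cross_attention_dim=768)
+    latents = torch.randn(1, 4, 64, 64)                # (batch, channel, height, width)
+    timestep = torch.tensor([10], dtype=torch.long)    # broadcast over the batch in forward
+    text_emb = torch.randn(1, 77, 768)                 # (batch, sequence_length, feature_dim)
+    with torch.no_grad():
+        noise_pred = unet(latents, timestep, encoder_hidden_states=text_emb).sample
+    assert noise_pred.shape == latents.shape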
diff --git a/src/model/TextGen/diffusers/models/unet_2d_condition_flax.py b/src/model/TextGen/diffusers/models/unet_2d_condition_flax.py
new file mode 100644
index 0000000..f0e7218
--- /dev/null
+++ b/src/model/TextGen/diffusers/models/unet_2d_condition_flax.py
@@ -0,0 +1,297 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Tuple, Union
+
+import flax
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict
+
+from ..configuration_utils import ConfigMixin, flax_register_to_config
+from ..modeling_flax_utils import FlaxModelMixin
+from ..utils import BaseOutput
+from .embeddings_flax import FlaxTimestepEmbedding, FlaxTimesteps
+from .unet_2d_blocks_flax import (
+ FlaxCrossAttnDownBlock2D,
+ FlaxCrossAttnUpBlock2D,
+ FlaxDownBlock2D,
+ FlaxUNetMidBlock2DCrossAttn,
+ FlaxUpBlock2D,
+)
+
+
+@flax.struct.dataclass
+class FlaxUNet2DConditionOutput(BaseOutput):
+ """
+ Args:
+ sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`):
+ Hidden states conditioned on `encoder_hidden_states` input. Output of last layer of model.
+ """
+
+ sample: jnp.ndarray
+
+
+@flax_register_to_config
+class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
+ r"""
+ FlaxUNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a
+ timestep and returns sample shaped output.
+
+ This model inherits from [`FlaxModelMixin`]. Check the superclass documentation for the generic methods the library
+ implements for all the models (such as downloading or saving, etc.)
+
+ Also, this model is a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module)
+ subclass. Use it as a regular Flax linen Module and refer to the Flax documentation for all matter related to
+ general usage and behavior.
+
+ Finally, this model supports inherent JAX features such as:
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+ Parameters:
+ sample_size (`int`, *optional*):
+ The size of the input sample.
+ in_channels (`int`, *optional*, defaults to 4):
+ The number of channels in the input sample.
+ out_channels (`int`, *optional*, defaults to 4):
+ The number of channels in the output.
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+ The tuple of downsample blocks to use. The corresponding class names will be: "FlaxCrossAttnDownBlock2D",
+ "FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxDownBlock2D"
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`):
+ The tuple of upsample blocks to use. The corresponding class names will be: "FlaxUpBlock2D",
+ "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D"
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+ The tuple of output channels for each block.
+ layers_per_block (`int`, *optional*, defaults to 2):
+ The number of layers per block.
+ attention_head_dim (`int`, *optional*, defaults to 8):
+ The dimension of the attention heads.
+        cross_attention_dim (`int`, *optional*, defaults to 1280):
+ The dimension of the cross attention features.
+ dropout (`float`, *optional*, defaults to 0):
+ Dropout probability for down, up and bottleneck blocks.
+ """
+
+ sample_size: int = 32
+ in_channels: int = 4
+ out_channels: int = 4
+ down_block_types: Tuple[str] = (
+ "CrossAttnDownBlock2D",
+ "CrossAttnDownBlock2D",
+ "CrossAttnDownBlock2D",
+ "DownBlock2D",
+ )
+ up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280)
+ layers_per_block: int = 2
+ attention_head_dim: int = 8
+ cross_attention_dim: int = 1280
+ dropout: float = 0.0
+ dtype: jnp.dtype = jnp.float32
+ freq_shift: int = 0
+
+ def init_weights(self, rng: jax.random.PRNGKey) -> FrozenDict:
+ # init input tensors
+ sample_shape = (1, self.in_channels, self.sample_size, self.sample_size)
+ sample = jnp.zeros(sample_shape, dtype=jnp.float32)
+ timesteps = jnp.ones((1,), dtype=jnp.int32)
+ encoder_hidden_states = jnp.zeros((1, 1, self.cross_attention_dim), dtype=jnp.float32)
+
+ params_rng, dropout_rng = jax.random.split(rng)
+ rngs = {"params": params_rng, "dropout": dropout_rng}
+
+ return self.init(rngs, sample, timesteps, encoder_hidden_states)["params"]
+
+ def setup(self):
+ block_out_channels = self.block_out_channels
+ time_embed_dim = block_out_channels[0] * 4
+
+ # input
+ self.conv_in = nn.Conv(
+ block_out_channels[0],
+ kernel_size=(3, 3),
+ strides=(1, 1),
+ padding=((1, 1), (1, 1)),
+ dtype=self.dtype,
+ )
+
+ # time
+ self.time_proj = FlaxTimesteps(block_out_channels[0], freq_shift=self.config.freq_shift)
+ self.time_embedding = FlaxTimestepEmbedding(time_embed_dim, dtype=self.dtype)
+
+ # down
+ down_blocks = []
+ output_channel = block_out_channels[0]
+ for i, down_block_type in enumerate(self.down_block_types):
+ input_channel = output_channel
+ output_channel = block_out_channels[i]
+ is_final_block = i == len(block_out_channels) - 1
+
+ if down_block_type == "CrossAttnDownBlock2D":
+ down_block = FlaxCrossAttnDownBlock2D(
+ in_channels=input_channel,
+ out_channels=output_channel,
+ dropout=self.dropout,
+ num_layers=self.layers_per_block,
+ attn_num_head_channels=self.attention_head_dim,
+ add_downsample=not is_final_block,
+ dtype=self.dtype,
+ )
+ else:
+ down_block = FlaxDownBlock2D(
+ in_channels=input_channel,
+ out_channels=output_channel,
+ dropout=self.dropout,
+ num_layers=self.layers_per_block,
+ add_downsample=not is_final_block,
+ dtype=self.dtype,
+ )
+
+ down_blocks.append(down_block)
+ self.down_blocks = down_blocks
+
+ # mid
+ self.mid_block = FlaxUNetMidBlock2DCrossAttn(
+ in_channels=block_out_channels[-1],
+ dropout=self.dropout,
+ attn_num_head_channels=self.attention_head_dim,
+ dtype=self.dtype,
+ )
+
+ # up
+ up_blocks = []
+ reversed_block_out_channels = list(reversed(block_out_channels))
+ output_channel = reversed_block_out_channels[0]
+ for i, up_block_type in enumerate(self.up_block_types):
+ prev_output_channel = output_channel
+ output_channel = reversed_block_out_channels[i]
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+ is_final_block = i == len(block_out_channels) - 1
+
+ if up_block_type == "CrossAttnUpBlock2D":
+ up_block = FlaxCrossAttnUpBlock2D(
+ in_channels=input_channel,
+ out_channels=output_channel,
+ prev_output_channel=prev_output_channel,
+ num_layers=self.layers_per_block + 1,
+ attn_num_head_channels=self.attention_head_dim,
+ add_upsample=not is_final_block,
+ dropout=self.dropout,
+ dtype=self.dtype,
+ )
+ else:
+ up_block = FlaxUpBlock2D(
+ in_channels=input_channel,
+ out_channels=output_channel,
+ prev_output_channel=prev_output_channel,
+ num_layers=self.layers_per_block + 1,
+ add_upsample=not is_final_block,
+ dropout=self.dropout,
+ dtype=self.dtype,
+ )
+
+ up_blocks.append(up_block)
+ prev_output_channel = output_channel
+ self.up_blocks = up_blocks
+
+ # out
+ self.conv_norm_out = nn.GroupNorm(num_groups=32, epsilon=1e-5)
+ self.conv_out = nn.Conv(
+ self.out_channels,
+ kernel_size=(3, 3),
+ strides=(1, 1),
+ padding=((1, 1), (1, 1)),
+ dtype=self.dtype,
+ )
+
+ def __call__(
+ self,
+ sample,
+ timesteps,
+ encoder_hidden_states,
+ return_dict: bool = True,
+ train: bool = False,
+ ) -> Union[FlaxUNet2DConditionOutput, Tuple]:
+ r"""
+ Args:
+            sample (`jnp.ndarray`): (batch, channel, height, width) noisy inputs tensor
+            timesteps (`jnp.ndarray` or `float` or `int`): timesteps
+            encoder_hidden_states (`jnp.ndarray`): (batch, sequence_length, hidden_size) encoder hidden states
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`models.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of a
+ plain tuple.
+            train (`bool`, *optional*, defaults to `False`):
+                When `False`, deterministic functions are used and dropout is disabled.
+
+ Returns:
+ [`~models.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] or `tuple`:
+ [`~models.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`.
+ When returning a tuple, the first element is the sample tensor.
+ """
+ # 1. time
+ if not isinstance(timesteps, jnp.ndarray):
+ timesteps = jnp.array([timesteps], dtype=jnp.int32)
+ elif isinstance(timesteps, jnp.ndarray) and len(timesteps.shape) == 0:
+ timesteps = timesteps.astype(dtype=jnp.float32)
+ timesteps = jnp.expand_dims(timesteps, 0)
+
+ t_emb = self.time_proj(timesteps)
+ t_emb = self.time_embedding(t_emb)
+
+ # 2. pre-process
+ sample = jnp.transpose(sample, (0, 2, 3, 1))
+ sample = self.conv_in(sample)
+
+ # 3. down
+ down_block_res_samples = (sample,)
+ for down_block in self.down_blocks:
+ if isinstance(down_block, FlaxCrossAttnDownBlock2D):
+ sample, res_samples = down_block(sample, t_emb, encoder_hidden_states, deterministic=not train)
+ else:
+ sample, res_samples = down_block(sample, t_emb, deterministic=not train)
+ down_block_res_samples += res_samples
+
+ # 4. mid
+ sample = self.mid_block(sample, t_emb, encoder_hidden_states, deterministic=not train)
+
+ # 5. up
+ for up_block in self.up_blocks:
+ res_samples = down_block_res_samples[-(self.layers_per_block + 1) :]
+ down_block_res_samples = down_block_res_samples[: -(self.layers_per_block + 1)]
+ if isinstance(up_block, FlaxCrossAttnUpBlock2D):
+ sample = up_block(
+ sample,
+ temb=t_emb,
+ encoder_hidden_states=encoder_hidden_states,
+ res_hidden_states_tuple=res_samples,
+ deterministic=not train,
+ )
+ else:
+ sample = up_block(sample, temb=t_emb, res_hidden_states_tuple=res_samples, deterministic=not train)
+
+ # 6. post-process
+ sample = self.conv_norm_out(sample)
+ sample = nn.silu(sample)
+ sample = self.conv_out(sample)
+ sample = jnp.transpose(sample, (0, 3, 1, 2))
+
+ if not return_dict:
+ return (sample,)
+
+ return FlaxUNet2DConditionOutput(sample=sample)
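+
+
+if __name__ == "__main__":
+    # Editorial smoke-test sketch, not part of the library: initialise parameters
+    # with `init_weights`, then run a forward pass through `apply`. Shapes follow
+    # the field defaults above (in_channels=4, cross_attention_dim=1280).
+    model = FlaxUNet2DConditionModel()
+    params = model.init_weights(jax.random.PRNGKey(0))
+    sample = jnp.zeros((1, 4, 32, 32))                  # (batch, channel, height, width)
+    timesteps = jnp.ones((1,), dtype=jnp.int32)
+    encoder_hidden_states = jnp.zeros((1, 77, 1280))    # (batch, sequence_length, hidden_size)
+    out = model.apply({"params": params}, sample, timesteps, encoder_hidden_states)
+    assert out.sample.shape == sample.shape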
diff --git a/src/model/TextGen/diffusers/models/vae.py b/src/model/TextGen/diffusers/models/vae.py
new file mode 100644
index 0000000..30de343
--- /dev/null
+++ b/src/model/TextGen/diffusers/models/vae.py
@@ -0,0 +1,614 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..modeling_utils import ModelMixin
+from ..utils import BaseOutput
+from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block
+
+
+@dataclass
+class DecoderOutput(BaseOutput):
+ """
+ Output of decoding method.
+
+ Args:
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Decoded output sample of the model. Output of the last layer of the model.
+ """
+
+ sample: torch.FloatTensor
+
+
+@dataclass
+class VQEncoderOutput(BaseOutput):
+ """
+ Output of VQModel encoding method.
+
+ Args:
+ latents (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Encoded output sample of the model. Output of the last layer of the model.
+ """
+
+ latents: torch.FloatTensor
+
+
+@dataclass
+class AutoencoderKLOutput(BaseOutput):
+ """
+ Output of AutoencoderKL encoding method.
+
+ Args:
+ latent_dist (`DiagonalGaussianDistribution`):
+ Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`.
+ `DiagonalGaussianDistribution` allows for sampling latents from the distribution.
+ """
+
+ latent_dist: "DiagonalGaussianDistribution"
+
+
+class Encoder(nn.Module):
+ def __init__(
+ self,
+ in_channels=3,
+ out_channels=3,
+ down_block_types=("DownEncoderBlock2D",),
+ block_out_channels=(64,),
+ layers_per_block=2,
+ norm_num_groups=32,
+ act_fn="silu",
+ double_z=True,
+ ):
+ super().__init__()
+ self.layers_per_block = layers_per_block
+
+ self.conv_in = torch.nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1)
+
+ self.mid_block = None
+ self.down_blocks = nn.ModuleList([])
+
+ # down
+ output_channel = block_out_channels[0]
+ for i, down_block_type in enumerate(down_block_types):
+ input_channel = output_channel
+ output_channel = block_out_channels[i]
+ is_final_block = i == len(block_out_channels) - 1
+
+ down_block = get_down_block(
+ down_block_type,
+ num_layers=self.layers_per_block,
+ in_channels=input_channel,
+ out_channels=output_channel,
+ add_downsample=not is_final_block,
+ resnet_eps=1e-6,
+ downsample_padding=0,
+ resnet_act_fn=act_fn,
+ resnet_groups=norm_num_groups,
+ attn_num_head_channels=None,
+ temb_channels=None,
+ )
+ self.down_blocks.append(down_block)
+
+ # mid
+ self.mid_block = UNetMidBlock2D(
+ in_channels=block_out_channels[-1],
+ resnet_eps=1e-6,
+ resnet_act_fn=act_fn,
+ output_scale_factor=1,
+ resnet_time_scale_shift="default",
+ attn_num_head_channels=None,
+ resnet_groups=norm_num_groups,
+ temb_channels=None,
+ )
+
+ # out
+ self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6)
+ self.conv_act = nn.SiLU()
+
+ conv_out_channels = 2 * out_channels if double_z else out_channels
+ self.conv_out = nn.Conv2d(block_out_channels[-1], conv_out_channels, 3, padding=1)
+
+ def forward(self, x):
+ sample = x
+ sample = self.conv_in(sample)
+
+ # down
+ for down_block in self.down_blocks:
+ sample = down_block(sample)
+
+ # middle
+ sample = self.mid_block(sample)
+
+ # post-process
+ sample = self.conv_norm_out(sample)
+ sample = self.conv_act(sample)
+ sample = self.conv_out(sample)
+
+ return sample
+
+
+class Decoder(nn.Module):
+ def __init__(
+ self,
+ in_channels=3,
+ out_channels=3,
+ up_block_types=("UpDecoderBlock2D",),
+ block_out_channels=(64,),
+ layers_per_block=2,
+ norm_num_groups=32,
+ act_fn="silu",
+ ):
+ super().__init__()
+ self.layers_per_block = layers_per_block
+
+ self.conv_in = nn.Conv2d(in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1)
+
+ self.mid_block = None
+ self.up_blocks = nn.ModuleList([])
+
+ # mid
+ self.mid_block = UNetMidBlock2D(
+ in_channels=block_out_channels[-1],
+ resnet_eps=1e-6,
+ resnet_act_fn=act_fn,
+ output_scale_factor=1,
+ resnet_time_scale_shift="default",
+ attn_num_head_channels=None,
+ resnet_groups=norm_num_groups,
+ temb_channels=None,
+ )
+
+ # up
+ reversed_block_out_channels = list(reversed(block_out_channels))
+ output_channel = reversed_block_out_channels[0]
+ for i, up_block_type in enumerate(up_block_types):
+ prev_output_channel = output_channel
+ output_channel = reversed_block_out_channels[i]
+
+ is_final_block = i == len(block_out_channels) - 1
+
+ up_block = get_up_block(
+ up_block_type,
+ num_layers=self.layers_per_block + 1,
+ in_channels=prev_output_channel,
+ out_channels=output_channel,
+ prev_output_channel=None,
+ add_upsample=not is_final_block,
+ resnet_eps=1e-6,
+ resnet_act_fn=act_fn,
+ resnet_groups=norm_num_groups,
+ attn_num_head_channels=None,
+ temb_channels=None,
+ )
+ self.up_blocks.append(up_block)
+ prev_output_channel = output_channel
+
+ # out
+ self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
+ self.conv_act = nn.SiLU()
+ self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1)
+
+ def forward(self, z):
+ sample = z
+ sample = self.conv_in(sample)
+
+ # middle
+ sample = self.mid_block(sample)
+
+ # up
+ for up_block in self.up_blocks:
+ sample = up_block(sample)
+
+ # post-process
+ sample = self.conv_norm_out(sample)
+ sample = self.conv_act(sample)
+ sample = self.conv_out(sample)
+
+ return sample
+
+
+class VectorQuantizer(nn.Module):
+ """
+ Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly avoids costly matrix
+ multiplications and allows for post-hoc remapping of indices.
+ """
+
+    # NOTE: due to a bug, the beta term was applied to the wrong term. For
+    # backwards compatibility we use the buggy version by default, but you can
+    # specify legacy=False to fix it.
+ def __init__(
+ self, n_e, vq_embed_dim, beta, remap=None, unknown_index="random", sane_index_shape=False, legacy=True
+ ):
+ super().__init__()
+ self.n_e = n_e
+ self.vq_embed_dim = vq_embed_dim
+ self.beta = beta
+ self.legacy = legacy
+
+ self.embedding = nn.Embedding(self.n_e, self.vq_embed_dim)
+ self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
+
+ self.remap = remap
+ if self.remap is not None:
+ self.register_buffer("used", torch.tensor(np.load(self.remap)))
+ self.re_embed = self.used.shape[0]
+ self.unknown_index = unknown_index # "random" or "extra" or integer
+ if self.unknown_index == "extra":
+ self.unknown_index = self.re_embed
+ self.re_embed = self.re_embed + 1
+ print(
+ f"Remapping {self.n_e} indices to {self.re_embed} indices. "
+ f"Using {self.unknown_index} for unknown indices."
+ )
+ else:
+ self.re_embed = n_e
+
+ self.sane_index_shape = sane_index_shape
+
+ def remap_to_used(self, inds):
+ ishape = inds.shape
+ assert len(ishape) > 1
+ inds = inds.reshape(ishape[0], -1)
+ used = self.used.to(inds)
+ match = (inds[:, :, None] == used[None, None, ...]).long()
+ new = match.argmax(-1)
+ unknown = match.sum(2) < 1
+ if self.unknown_index == "random":
+ new[unknown] = torch.randint(0, self.re_embed, size=new[unknown].shape).to(device=new.device)
+ else:
+ new[unknown] = self.unknown_index
+ return new.reshape(ishape)
+
+ def unmap_to_all(self, inds):
+ ishape = inds.shape
+ assert len(ishape) > 1
+ inds = inds.reshape(ishape[0], -1)
+ used = self.used.to(inds)
+ if self.re_embed > self.used.shape[0]: # extra token
+ inds[inds >= self.used.shape[0]] = 0 # simply set to zero
+ back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds)
+ return back.reshape(ishape)
+
+ def forward(self, z):
+ # reshape z -> (batch, height, width, channel) and flatten
+ z = z.permute(0, 2, 3, 1).contiguous()
+ z_flattened = z.view(-1, self.vq_embed_dim)
+ # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
+
+ d = (
+ torch.sum(z_flattened**2, dim=1, keepdim=True)
+ + torch.sum(self.embedding.weight**2, dim=1)
+ - 2 * torch.einsum("bd,dn->bn", z_flattened, self.embedding.weight.t())
+ )
+
+ min_encoding_indices = torch.argmin(d, dim=1)
+ z_q = self.embedding(min_encoding_indices).view(z.shape)
+ perplexity = None
+ min_encodings = None
+
+ # compute loss for embedding
+ if not self.legacy:
+ loss = self.beta * torch.mean((z_q.detach() - z) ** 2) + torch.mean((z_q - z.detach()) ** 2)
+ else:
+ loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean((z_q - z.detach()) ** 2)
+
+ # preserve gradients
+ z_q = z + (z_q - z).detach()
+
+ # reshape back to match original input shape
+ z_q = z_q.permute(0, 3, 1, 2).contiguous()
+
+ if self.remap is not None:
+ min_encoding_indices = min_encoding_indices.reshape(z.shape[0], -1) # add batch axis
+ min_encoding_indices = self.remap_to_used(min_encoding_indices)
+ min_encoding_indices = min_encoding_indices.reshape(-1, 1) # flatten
+
+ if self.sane_index_shape:
+ min_encoding_indices = min_encoding_indices.reshape(z_q.shape[0], z_q.shape[2], z_q.shape[3])
+
+ return z_q, loss, (perplexity, min_encodings, min_encoding_indices)
+
+ def get_codebook_entry(self, indices, shape):
+ # shape specifying (batch, height, width, channel)
+ if self.remap is not None:
+ indices = indices.reshape(shape[0], -1) # add batch axis
+ indices = self.unmap_to_all(indices)
+ indices = indices.reshape(-1) # flatten again
+
+ # get quantized latent vectors
+ z_q = self.embedding(indices)
+
+ if shape is not None:
+ z_q = z_q.view(shape)
+ # reshape back to match original input shape
+ z_q = z_q.permute(0, 3, 1, 2).contiguous()
+
+ return z_q
+
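+
+def _example_vector_quantizer():
+    # Editorial sketch (not upstream diffusers code): quantize a (B, C, H, W)
+    # latent against a small codebook; n_e and vq_embed_dim are illustrative.
+    vq = VectorQuantizer(n_e=16, vq_embed_dim=8, beta=0.25, sane_index_shape=True)
+    z = torch.randn(2, 8, 4, 4)                  # channel dim must equal vq_embed_dim
+    z_q, loss, (_, _, indices) = vq(z)
+    assert z_q.shape == z.shape                  # straight-through output keeps the input shape
+    assert indices.shape == (2, 4, 4)            # sane_index_shape -> (batch, height, width)
+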
+
+class DiagonalGaussianDistribution(object):
+ def __init__(self, parameters, deterministic=False):
+ self.parameters = parameters
+ self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
+ self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
+ self.deterministic = deterministic
+ self.std = torch.exp(0.5 * self.logvar)
+ self.var = torch.exp(self.logvar)
+ if self.deterministic:
+ self.var = self.std = torch.zeros_like(
+ self.mean, device=self.parameters.device, dtype=self.parameters.dtype
+ )
+
+ def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor:
+ device = self.parameters.device
+ sample_device = "cpu" if device.type == "mps" else device
+ sample = torch.randn(self.mean.shape, generator=generator, device=sample_device)
+ # make sure sample is on the same device as the parameters and has same dtype
+ sample = sample.to(device=device, dtype=self.parameters.dtype)
+ x = self.mean + self.std * sample
+ return x
+
+ def kl(self, other=None):
+ if self.deterministic:
+ return torch.Tensor([0.0])
+ else:
+ if other is None:
+ return 0.5 * torch.sum(torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, dim=[1, 2, 3])
+ else:
+ return 0.5 * torch.sum(
+ torch.pow(self.mean - other.mean, 2) / other.var
+ + self.var / other.var
+ - 1.0
+ - self.logvar
+ + other.logvar,
+ dim=[1, 2, 3],
+ )
+
+ def nll(self, sample, dims=[1, 2, 3]):
+ if self.deterministic:
+ return torch.Tensor([0.0])
+ logtwopi = np.log(2.0 * np.pi)
+ return 0.5 * torch.sum(logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, dim=dims)
+
+ def mode(self):
+ return self.mean
+
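+
+def _example_diagonal_gaussian():
+    # Editorial sketch (not upstream diffusers code): the distribution is
+    # parameterised by mean and logvar concatenated along the channel axis,
+    # so a tensor with 8 channels yields a 4-channel latent.
+    parameters = torch.randn(2, 8, 4, 4)
+    posterior = DiagonalGaussianDistribution(parameters)
+    z = posterior.sample()                       # reparameterised draw, shape (2, 4, 4, 4)
+    kl = posterior.kl()                          # KL against a standard normal, shape (2,)
+    assert z.shape == (2, 4, 4, 4) and kl.shape == (2,)
+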
+
+class VQModel(ModelMixin, ConfigMixin):
+ r"""VQ-VAE model from the paper Neural Discrete Representation Learning by Aaron van den Oord, Oriol Vinyals and Koray
+ Kavukcuoglu.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
+    implements for all models (such as downloading or saving, etc.)
+
+ Parameters:
+ in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
+ out_channels (int, *optional*, defaults to 3): Number of channels in the output.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
+            Tuple of downsample block types.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
+            Tuple of upsample block types.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
+            Tuple of block output channels.
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+ latent_channels (`int`, *optional*, defaults to `3`): Number of channels in the latent space.
+ sample_size (`int`, *optional*, defaults to `32`): TODO
+ num_vq_embeddings (`int`, *optional*, defaults to `256`): Number of codebook vectors in the VQ-VAE.
+ vq_embed_dim (`int`, *optional*): Hidden dim of codebook vectors in the VQ-VAE.
+ """
+
+ @register_to_config
+ def __init__(
+ self,
+ in_channels: int = 3,
+ out_channels: int = 3,
+ down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
+ up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
+ block_out_channels: Tuple[int] = (64,),
+ layers_per_block: int = 1,
+ act_fn: str = "silu",
+ latent_channels: int = 3,
+ sample_size: int = 32,
+ num_vq_embeddings: int = 256,
+ norm_num_groups: int = 32,
+ vq_embed_dim: Optional[int] = None,
+ ):
+ super().__init__()
+
+ # pass init params to Encoder
+ self.encoder = Encoder(
+ in_channels=in_channels,
+ out_channels=latent_channels,
+ down_block_types=down_block_types,
+ block_out_channels=block_out_channels,
+ layers_per_block=layers_per_block,
+ act_fn=act_fn,
+ norm_num_groups=norm_num_groups,
+ double_z=False,
+ )
+
+ vq_embed_dim = vq_embed_dim if vq_embed_dim is not None else latent_channels
+
+ self.quant_conv = torch.nn.Conv2d(latent_channels, vq_embed_dim, 1)
+ self.quantize = VectorQuantizer(num_vq_embeddings, vq_embed_dim, beta=0.25, remap=None, sane_index_shape=False)
+ self.post_quant_conv = torch.nn.Conv2d(vq_embed_dim, latent_channels, 1)
+
+ # pass init params to Decoder
+ self.decoder = Decoder(
+ in_channels=latent_channels,
+ out_channels=out_channels,
+ up_block_types=up_block_types,
+ block_out_channels=block_out_channels,
+ layers_per_block=layers_per_block,
+ act_fn=act_fn,
+ norm_num_groups=norm_num_groups,
+ )
+
+ def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput:
+ h = self.encoder(x)
+ h = self.quant_conv(h)
+
+ if not return_dict:
+ return (h,)
+
+ return VQEncoderOutput(latents=h)
+
+ def decode(
+ self, h: torch.FloatTensor, force_not_quantize: bool = False, return_dict: bool = True
+ ) -> Union[DecoderOutput, torch.FloatTensor]:
+ # also go through quantization layer
+ if not force_not_quantize:
+ quant, emb_loss, info = self.quantize(h)
+ else:
+ quant = h
+ quant = self.post_quant_conv(quant)
+ dec = self.decoder(quant)
+
+ if not return_dict:
+ return (dec,)
+
+ return DecoderOutput(sample=dec)
+
+ def forward(self, sample: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
+ r"""
+ Args:
+ sample (`torch.FloatTensor`): Input sample.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
+ """
+ x = sample
+ h = self.encode(x).latents
+ dec = self.decode(h).sample
+
+ if not return_dict:
+ return (dec,)
+
+ return DecoderOutput(sample=dec)
+
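+
+def _example_vq_model():
+    # Editorial sketch (not upstream diffusers code): round-trip a small image
+    # through the default single-block VQModel; sizes are illustrative.
+    model = VQModel()
+    image = torch.randn(1, 3, 32, 32)
+    with torch.no_grad():
+        latents = model.encode(image).latents    # continuous latents, pre-quantization
+        recon = model.decode(latents).sample     # codebook lookup happens inside decode
+    assert recon.shape == image.shape
+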
+
+class AutoencoderKL(ModelMixin, ConfigMixin):
+ r"""Variational Autoencoder (VAE) model with KL loss from the paper Auto-Encoding Variational Bayes by Diederik P. Kingma
+ and Max Welling.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
+    implements for all models (such as downloading or saving, etc.)
+
+ Parameters:
+ in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
+ out_channels (int, *optional*, defaults to 3): Number of channels in the output.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
+            Tuple of downsample block types.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
+            Tuple of upsample block types.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
+            Tuple of block output channels.
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+ latent_channels (`int`, *optional*, defaults to `4`): Number of channels in the latent space.
+ sample_size (`int`, *optional*, defaults to `32`): TODO
+ """
+
+ @register_to_config
+ def __init__(
+ self,
+ in_channels: int = 3,
+ out_channels: int = 3,
+ down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
+ up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
+ block_out_channels: Tuple[int] = (64,),
+ layers_per_block: int = 1,
+ act_fn: str = "silu",
+ latent_channels: int = 4,
+ norm_num_groups: int = 32,
+ sample_size: int = 32,
+ ):
+ super().__init__()
+
+ # pass init params to Encoder
+ self.encoder = Encoder(
+ in_channels=in_channels,
+ out_channels=latent_channels,
+ down_block_types=down_block_types,
+ block_out_channels=block_out_channels,
+ layers_per_block=layers_per_block,
+ act_fn=act_fn,
+ norm_num_groups=norm_num_groups,
+ double_z=True,
+ )
+
+ # pass init params to Decoder
+ self.decoder = Decoder(
+ in_channels=latent_channels,
+ out_channels=out_channels,
+ up_block_types=up_block_types,
+ block_out_channels=block_out_channels,
+ layers_per_block=layers_per_block,
+ norm_num_groups=norm_num_groups,
+ act_fn=act_fn,
+ )
+
+ self.quant_conv = torch.nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1)
+ self.post_quant_conv = torch.nn.Conv2d(latent_channels, latent_channels, 1)
+
+ def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
+ h = self.encoder(x)
+ moments = self.quant_conv(h)
+ posterior = DiagonalGaussianDistribution(moments)
+
+ if not return_dict:
+ return (posterior,)
+
+ return AutoencoderKLOutput(latent_dist=posterior)
+
+ def decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
+ z = self.post_quant_conv(z)
+ dec = self.decoder(z)
+
+ if not return_dict:
+ return (dec,)
+
+ return DecoderOutput(sample=dec)
+
+ def forward(
+ self,
+ sample: torch.FloatTensor,
+ sample_posterior: bool = False,
+ return_dict: bool = True,
+ generator: Optional[torch.Generator] = None,
+ ) -> Union[DecoderOutput, torch.FloatTensor]:
+ r"""
+ Args:
+ sample (`torch.FloatTensor`): Input sample.
+ sample_posterior (`bool`, *optional*, defaults to `False`):
+ Whether to sample from the posterior.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
+ """
+ x = sample
+ posterior = self.encode(x).latent_dist
+ if sample_posterior:
+ z = posterior.sample(generator=generator)
+ else:
+ z = posterior.mode()
+ dec = self.decode(z).sample
+
+ if not return_dict:
+ return (dec,)
+
+ return DecoderOutput(sample=dec)
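+
+
+if __name__ == "__main__":
+    # Editorial smoke-test sketch, not part of the library: encode an image to a
+    # diagonal Gaussian posterior, draw a latent, and decode it back. The default
+    # config (one block, block_out_channels=(64,)) keeps this cheap; pretrained
+    # Stable Diffusion checkpoints use a much larger configuration.
+    vae = AutoencoderKL()
+    image = torch.randn(1, 3, 32, 32)
+    with torch.no_grad():
+        posterior = vae.encode(image).latent_dist
+        z = posterior.sample()
+        recon = vae.decode(z).sample
+    assert recon.shape == image.shape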
diff --git a/src/model/TextGen/diffusers/models/vae_flax.py b/src/model/TextGen/diffusers/models/vae_flax.py
new file mode 100644
index 0000000..7ecda9a
--- /dev/null
+++ b/src/model/TextGen/diffusers/models/vae_flax.py
@@ -0,0 +1,858 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# JAX implementation of VQGAN from taming-transformers https://github.com/CompVis/taming-transformers
+
+import math
+from functools import partial
+from typing import Tuple
+
+import flax
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict
+
+from ..configuration_utils import ConfigMixin, flax_register_to_config
+from ..modeling_flax_utils import FlaxModelMixin
+from ..utils import BaseOutput
+
+
+@flax.struct.dataclass
+class FlaxDecoderOutput(BaseOutput):
+ """
+ Output of decoding method.
+
+ Args:
+ sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`):
+ Decoded output sample of the model. Output of the last layer of the model.
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+ """
+
+ sample: jnp.ndarray
+
+
+@flax.struct.dataclass
+class FlaxAutoencoderKLOutput(BaseOutput):
+ """
+ Output of AutoencoderKL encoding method.
+
+ Args:
+ latent_dist (`FlaxDiagonalGaussianDistribution`):
+ Encoded outputs of `Encoder` represented as the mean and logvar of `FlaxDiagonalGaussianDistribution`.
+ `FlaxDiagonalGaussianDistribution` allows for sampling latents from the distribution.
+ """
+
+ latent_dist: "FlaxDiagonalGaussianDistribution"
+
+
+class FlaxUpsample2D(nn.Module):
+ """
+ Flax implementation of 2D Upsample layer
+
+ Args:
+ in_channels (`int`):
+ Input channels
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+ """
+
+ in_channels: int
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ self.conv = nn.Conv(
+ self.in_channels,
+ kernel_size=(3, 3),
+ strides=(1, 1),
+ padding=((1, 1), (1, 1)),
+ dtype=self.dtype,
+ )
+
+ def __call__(self, hidden_states):
+ batch, height, width, channels = hidden_states.shape
+ hidden_states = jax.image.resize(
+ hidden_states,
+ shape=(batch, height * 2, width * 2, channels),
+ method="nearest",
+ )
+ hidden_states = self.conv(hidden_states)
+ return hidden_states
+
+
+class FlaxDownsample2D(nn.Module):
+ """
+ Flax implementation of 2D Downsample layer
+
+ Args:
+ in_channels (`int`):
+ Input channels
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+ """
+
+ in_channels: int
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ self.conv = nn.Conv(
+ self.in_channels,
+ kernel_size=(3, 3),
+ strides=(2, 2),
+ padding="VALID",
+ dtype=self.dtype,
+ )
+
+ def __call__(self, hidden_states):
+ pad = ((0, 0), (0, 1), (0, 1), (0, 0)) # pad height and width dim
+ hidden_states = jnp.pad(hidden_states, pad_width=pad)
+ hidden_states = self.conv(hidden_states)
+ return hidden_states
+
+
+class FlaxResnetBlock2D(nn.Module):
+ """
+ Flax implementation of 2D Resnet Block.
+
+ Args:
+ in_channels (`int`):
+ Input channels
+ out_channels (`int`):
+ Output channels
+ dropout (:obj:`float`, *optional*, defaults to 0.0):
+ Dropout rate
+ groups (:obj:`int`, *optional*, defaults to `32`):
+ The number of groups to use for group norm.
+ use_nin_shortcut (:obj:`bool`, *optional*, defaults to `None`):
+ Whether to use `nin_shortcut`. This activates a new layer inside ResNet block
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+ """
+
+ in_channels: int
+ out_channels: int = None
+ dropout: float = 0.0
+ groups: int = 32
+ use_nin_shortcut: bool = None
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ out_channels = self.in_channels if self.out_channels is None else self.out_channels
+
+ self.norm1 = nn.GroupNorm(num_groups=self.groups, epsilon=1e-6)
+ self.conv1 = nn.Conv(
+ out_channels,
+ kernel_size=(3, 3),
+ strides=(1, 1),
+ padding=((1, 1), (1, 1)),
+ dtype=self.dtype,
+ )
+
+ self.norm2 = nn.GroupNorm(num_groups=self.groups, epsilon=1e-6)
+ self.dropout_layer = nn.Dropout(self.dropout)
+ self.conv2 = nn.Conv(
+ out_channels,
+ kernel_size=(3, 3),
+ strides=(1, 1),
+ padding=((1, 1), (1, 1)),
+ dtype=self.dtype,
+ )
+
+ use_nin_shortcut = self.in_channels != out_channels if self.use_nin_shortcut is None else self.use_nin_shortcut
+
+ self.conv_shortcut = None
+ if use_nin_shortcut:
+ self.conv_shortcut = nn.Conv(
+ out_channels,
+ kernel_size=(1, 1),
+ strides=(1, 1),
+ padding="VALID",
+ dtype=self.dtype,
+ )
+
+ def __call__(self, hidden_states, deterministic=True):
+ residual = hidden_states
+ hidden_states = self.norm1(hidden_states)
+ hidden_states = nn.swish(hidden_states)
+ hidden_states = self.conv1(hidden_states)
+
+ hidden_states = self.norm2(hidden_states)
+ hidden_states = nn.swish(hidden_states)
+ hidden_states = self.dropout_layer(hidden_states, deterministic)
+ hidden_states = self.conv2(hidden_states)
+
+ if self.conv_shortcut is not None:
+ residual = self.conv_shortcut(residual)
+
+ return hidden_states + residual
+
+
+class FlaxAttentionBlock(nn.Module):
+ r"""
+ Flax Convolutional based multi-head attention block for diffusion-based VAE.
+
+ Parameters:
+ channels (:obj:`int`):
+ Input channels
+ num_head_channels (:obj:`int`, *optional*, defaults to `None`):
+ Number of attention heads
+ num_groups (:obj:`int`, *optional*, defaults to `32`):
+ The number of groups to use for group norm
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+
+ """
+ channels: int
+ num_head_channels: int = None
+ num_groups: int = 32
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ self.num_heads = self.channels // self.num_head_channels if self.num_head_channels is not None else 1
+
+ dense = partial(nn.Dense, self.channels, dtype=self.dtype)
+
+ self.group_norm = nn.GroupNorm(num_groups=self.num_groups, epsilon=1e-6)
+ self.query, self.key, self.value = dense(), dense(), dense()
+ self.proj_attn = dense()
+
+ def transpose_for_scores(self, projection):
+ new_projection_shape = projection.shape[:-1] + (self.num_heads, -1)
+ # move heads to 2nd position (B, T, H * D) -> (B, T, H, D)
+ new_projection = projection.reshape(new_projection_shape)
+ # (B, T, H, D) -> (B, H, T, D)
+ new_projection = jnp.transpose(new_projection, (0, 2, 1, 3))
+ return new_projection
+
+ def __call__(self, hidden_states):
+ residual = hidden_states
+ batch, height, width, channels = hidden_states.shape
+
+ hidden_states = self.group_norm(hidden_states)
+
+ hidden_states = hidden_states.reshape((batch, height * width, channels))
+
+ query = self.query(hidden_states)
+ key = self.key(hidden_states)
+ value = self.value(hidden_states)
+
+ # transpose
+ query = self.transpose_for_scores(query)
+ key = self.transpose_for_scores(key)
+ value = self.transpose_for_scores(value)
+
+ # compute attentions
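+        # Editorial note: the 1/sqrt(sqrt(head_dim)) factor is applied to both the
+        # query and the key before the einsum, which equals the usual 1/sqrt(head_dim)
+        # softmax scaling but keeps intermediate values smaller in reduced precision.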
+ scale = 1 / math.sqrt(math.sqrt(self.channels / self.num_heads))
+ attn_weights = jnp.einsum("...qc,...kc->...qk", query * scale, key * scale)
+ attn_weights = nn.softmax(attn_weights, axis=-1)
+
+ # attend to values
+ hidden_states = jnp.einsum("...kc,...qk->...qc", value, attn_weights)
+
+ hidden_states = jnp.transpose(hidden_states, (0, 2, 1, 3))
+ new_hidden_states_shape = hidden_states.shape[:-2] + (self.channels,)
+ hidden_states = hidden_states.reshape(new_hidden_states_shape)
+
+ hidden_states = self.proj_attn(hidden_states)
+ hidden_states = hidden_states.reshape((batch, height, width, channels))
+ hidden_states = hidden_states + residual
+ return hidden_states
+
+
+class FlaxDownEncoderBlock2D(nn.Module):
+ r"""
+ Flax Resnet blocks-based Encoder block for diffusion-based VAE.
+
+ Parameters:
+ in_channels (:obj:`int`):
+ Input channels
+ out_channels (:obj:`int`):
+ Output channels
+ dropout (:obj:`float`, *optional*, defaults to 0.0):
+ Dropout rate
+ num_layers (:obj:`int`, *optional*, defaults to 1):
+ Number of Resnet layer block
+ resnet_groups (:obj:`int`, *optional*, defaults to `32`):
+ The number of groups to use for the Resnet block group norm
+ add_downsample (:obj:`bool`, *optional*, defaults to `True`):
+ Whether to add downsample layer
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+ """
+ in_channels: int
+ out_channels: int
+ dropout: float = 0.0
+ num_layers: int = 1
+ resnet_groups: int = 32
+ add_downsample: bool = True
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ resnets = []
+ for i in range(self.num_layers):
+ in_channels = self.in_channels if i == 0 else self.out_channels
+
+ res_block = FlaxResnetBlock2D(
+ in_channels=in_channels,
+ out_channels=self.out_channels,
+ dropout=self.dropout,
+ groups=self.resnet_groups,
+ dtype=self.dtype,
+ )
+ resnets.append(res_block)
+ self.resnets = resnets
+
+ if self.add_downsample:
+ self.downsamplers_0 = FlaxDownsample2D(self.out_channels, dtype=self.dtype)
+
+ def __call__(self, hidden_states, deterministic=True):
+ for resnet in self.resnets:
+ hidden_states = resnet(hidden_states, deterministic=deterministic)
+
+ if self.add_downsample:
+ hidden_states = self.downsamplers_0(hidden_states)
+
+ return hidden_states
+
+
+class FlaxUpDecoderBlock2D(nn.Module):
+ r"""
+ Flax Resnet blocks-based Decoder block for diffusion-based VAE.
+
+ Parameters:
+ in_channels (:obj:`int`):
+ Input channels
+ out_channels (:obj:`int`):
+ Output channels
+ dropout (:obj:`float`, *optional*, defaults to 0.0):
+ Dropout rate
+ num_layers (:obj:`int`, *optional*, defaults to 1):
+ Number of Resnet layer block
+ resnet_groups (:obj:`int`, *optional*, defaults to `32`):
+ The number of groups to use for the Resnet block group norm
+ add_upsample (:obj:`bool`, *optional*, defaults to `True`):
+ Whether to add upsample layer
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+ """
+ in_channels: int
+ out_channels: int
+ dropout: float = 0.0
+ num_layers: int = 1
+ resnet_groups: int = 32
+ add_upsample: bool = True
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ resnets = []
+ for i in range(self.num_layers):
+ in_channels = self.in_channels if i == 0 else self.out_channels
+ res_block = FlaxResnetBlock2D(
+ in_channels=in_channels,
+ out_channels=self.out_channels,
+ dropout=self.dropout,
+ groups=self.resnet_groups,
+ dtype=self.dtype,
+ )
+ resnets.append(res_block)
+
+ self.resnets = resnets
+
+ if self.add_upsample:
+ self.upsamplers_0 = FlaxUpsample2D(self.out_channels, dtype=self.dtype)
+
+ def __call__(self, hidden_states, deterministic=True):
+ for resnet in self.resnets:
+ hidden_states = resnet(hidden_states, deterministic=deterministic)
+
+ if self.add_upsample:
+ hidden_states = self.upsamplers_0(hidden_states)
+
+ return hidden_states
+
+
+class FlaxUNetMidBlock2D(nn.Module):
+ r"""
+ Flax Unet Mid-Block module.
+
+ Parameters:
+ in_channels (:obj:`int`):
+ Input channels
+ dropout (:obj:`float`, *optional*, defaults to 0.0):
+ Dropout rate
+ num_layers (:obj:`int`, *optional*, defaults to 1):
+ Number of Resnet layer block
+ resnet_groups (:obj:`int`, *optional*, defaults to `32`):
+ The number of groups to use for the Resnet and Attention block group norm
+ attn_num_head_channels (:obj:`int`, *optional*, defaults to `1`):
+ Number of attention heads for each attention block
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+ """
+ in_channels: int
+ dropout: float = 0.0
+ num_layers: int = 1
+ resnet_groups: int = 32
+ attn_num_head_channels: int = 1
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ resnet_groups = self.resnet_groups if self.resnet_groups is not None else min(self.in_channels // 4, 32)
+
+ # there is always at least one resnet
+ resnets = [
+ FlaxResnetBlock2D(
+ in_channels=self.in_channels,
+ out_channels=self.in_channels,
+ dropout=self.dropout,
+ groups=resnet_groups,
+ dtype=self.dtype,
+ )
+ ]
+
+ attentions = []
+
+ for _ in range(self.num_layers):
+ attn_block = FlaxAttentionBlock(
+ channels=self.in_channels,
+ num_head_channels=self.attn_num_head_channels,
+ num_groups=resnet_groups,
+ dtype=self.dtype,
+ )
+ attentions.append(attn_block)
+
+ res_block = FlaxResnetBlock2D(
+ in_channels=self.in_channels,
+ out_channels=self.in_channels,
+ dropout=self.dropout,
+ groups=resnet_groups,
+ dtype=self.dtype,
+ )
+ resnets.append(res_block)
+
+ self.resnets = resnets
+ self.attentions = attentions
+
+ def __call__(self, hidden_states, deterministic=True):
+ hidden_states = self.resnets[0](hidden_states, deterministic=deterministic)
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
+ hidden_states = attn(hidden_states)
+ hidden_states = resnet(hidden_states, deterministic=deterministic)
+
+ return hidden_states
+
+
+class FlaxEncoder(nn.Module):
+ r"""
+ Flax Implementation of VAE Encoder.
+
+ This model is a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module)
+ subclass. Use it as a regular Flax linen Module and refer to the Flax documentation for all matter related to
+ general usage and behavior.
+
+ Finally, this model supports inherent JAX features such as:
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+ Parameters:
+ in_channels (:obj:`int`, *optional*, defaults to 3):
+ Input channels
+ out_channels (:obj:`int`, *optional*, defaults to 3):
+ Output channels
+ down_block_types (:obj:`Tuple[str]`, *optional*, defaults to `(DownEncoderBlock2D)`):
+ DownEncoder block type
+        block_out_channels (:obj:`Tuple[int]`, *optional*, defaults to `(64,)`):
+ Tuple containing the number of output channels for each block
+ layers_per_block (:obj:`int`, *optional*, defaults to `2`):
+ Number of Resnet layer for each block
+ norm_num_groups (:obj:`int`, *optional*, defaults to `32`):
+ norm num group
+ act_fn (:obj:`str`, *optional*, defaults to `silu`):
+ Activation function
+ double_z (:obj:`bool`, *optional*, defaults to `False`):
+ Whether to double the last output channels
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ Parameters `dtype`
+ """
+ in_channels: int = 3
+ out_channels: int = 3
+ down_block_types: Tuple[str] = ("DownEncoderBlock2D",)
+ block_out_channels: Tuple[int] = (64,)
+ layers_per_block: int = 2
+ norm_num_groups: int = 32
+ act_fn: str = "silu"
+ double_z: bool = False
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ block_out_channels = self.block_out_channels
+ # in
+ self.conv_in = nn.Conv(
+ block_out_channels[0],
+ kernel_size=(3, 3),
+ strides=(1, 1),
+ padding=((1, 1), (1, 1)),
+ dtype=self.dtype,
+ )
+
+ # downsampling
+ down_blocks = []
+ output_channel = block_out_channels[0]
+ for i, _ in enumerate(self.down_block_types):
+ input_channel = output_channel
+ output_channel = block_out_channels[i]
+ is_final_block = i == len(block_out_channels) - 1
+
+ down_block = FlaxDownEncoderBlock2D(
+ in_channels=input_channel,
+ out_channels=output_channel,
+ num_layers=self.layers_per_block,
+ resnet_groups=self.norm_num_groups,
+ add_downsample=not is_final_block,
+ dtype=self.dtype,
+ )
+ down_blocks.append(down_block)
+ self.down_blocks = down_blocks
+
+ # middle
+ self.mid_block = FlaxUNetMidBlock2D(
+ in_channels=block_out_channels[-1],
+ resnet_groups=self.norm_num_groups,
+ attn_num_head_channels=None,
+ dtype=self.dtype,
+ )
+
+ # end
+ conv_out_channels = 2 * self.out_channels if self.double_z else self.out_channels
+ self.conv_norm_out = nn.GroupNorm(num_groups=self.norm_num_groups, epsilon=1e-6)
+ self.conv_out = nn.Conv(
+ conv_out_channels,
+ kernel_size=(3, 3),
+ strides=(1, 1),
+ padding=((1, 1), (1, 1)),
+ dtype=self.dtype,
+ )
+
+ def __call__(self, sample, deterministic: bool = True):
+ # in
+ sample = self.conv_in(sample)
+
+ # downsampling
+ for block in self.down_blocks:
+ sample = block(sample, deterministic=deterministic)
+
+ # middle
+ sample = self.mid_block(sample, deterministic=deterministic)
+
+ # end
+ sample = self.conv_norm_out(sample)
+ sample = nn.swish(sample)
+ sample = self.conv_out(sample)
+
+ return sample
+
+
+class FlaxDecoder(nn.Module):
+ r"""
+ Flax Implementation of VAE Decoder.
+
+ This model is a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module)
+ subclass. Use it as a regular Flax linen Module and refer to the Flax documentation for all matter related to
+ general usage and behavior.
+
+ Finally, this model supports inherent JAX features such as:
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+ Parameters:
+ in_channels (:obj:`int`, *optional*, defaults to 3):
+ Input channels
+ out_channels (:obj:`int`, *optional*, defaults to 3):
+ Output channels
+ up_block_types (:obj:`Tuple[str]`, *optional*, defaults to `(UpDecoderBlock2D)`):
+ UpDecoder block type
+        block_out_channels (:obj:`Tuple[int]`, *optional*, defaults to `(64,)`):
+ Tuple containing the number of output channels for each block
+ layers_per_block (:obj:`int`, *optional*, defaults to `2`):
+ Number of Resnet layer for each block
+ norm_num_groups (:obj:`int`, *optional*, defaults to `32`):
+ norm num group
+ act_fn (:obj:`str`, *optional*, defaults to `silu`):
+ Activation function
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+ parameters `dtype`
+ """
+ in_channels: int = 3
+ out_channels: int = 3
+ up_block_types: Tuple[str] = ("UpDecoderBlock2D",)
+    block_out_channels: Tuple[int] = (64,)
+ layers_per_block: int = 2
+ norm_num_groups: int = 32
+ act_fn: str = "silu"
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ block_out_channels = self.block_out_channels
+
+ # z to block_in
+ self.conv_in = nn.Conv(
+ block_out_channels[-1],
+ kernel_size=(3, 3),
+ strides=(1, 1),
+ padding=((1, 1), (1, 1)),
+ dtype=self.dtype,
+ )
+
+ # middle
+ self.mid_block = FlaxUNetMidBlock2D(
+ in_channels=block_out_channels[-1],
+ resnet_groups=self.norm_num_groups,
+ attn_num_head_channels=None,
+ dtype=self.dtype,
+ )
+
+ # upsampling
+ reversed_block_out_channels = list(reversed(block_out_channels))
+ output_channel = reversed_block_out_channels[0]
+ up_blocks = []
+ for i, _ in enumerate(self.up_block_types):
+ prev_output_channel = output_channel
+ output_channel = reversed_block_out_channels[i]
+
+ is_final_block = i == len(block_out_channels) - 1
+
+ up_block = FlaxUpDecoderBlock2D(
+ in_channels=prev_output_channel,
+ out_channels=output_channel,
+ num_layers=self.layers_per_block + 1,
+ resnet_groups=self.norm_num_groups,
+ add_upsample=not is_final_block,
+ dtype=self.dtype,
+ )
+ up_blocks.append(up_block)
+ prev_output_channel = output_channel
+
+ self.up_blocks = up_blocks
+
+ # end
+ self.conv_norm_out = nn.GroupNorm(num_groups=self.norm_num_groups, epsilon=1e-6)
+ self.conv_out = nn.Conv(
+ self.out_channels,
+ kernel_size=(3, 3),
+ strides=(1, 1),
+ padding=((1, 1), (1, 1)),
+ dtype=self.dtype,
+ )
+
+ def __call__(self, sample, deterministic: bool = True):
+ # z to block_in
+ sample = self.conv_in(sample)
+
+ # middle
+ sample = self.mid_block(sample, deterministic=deterministic)
+
+ # upsampling
+ for block in self.up_blocks:
+ sample = block(sample, deterministic=deterministic)
+
+ sample = self.conv_norm_out(sample)
+ sample = nn.swish(sample)
+ sample = self.conv_out(sample)
+
+ return sample
+
+
+class FlaxDiagonalGaussianDistribution(object):
+ def __init__(self, parameters, deterministic=False):
+ # Last axis to account for channels-last
+ self.mean, self.logvar = jnp.split(parameters, 2, axis=-1)
+ self.logvar = jnp.clip(self.logvar, -30.0, 20.0)
+ self.deterministic = deterministic
+ self.std = jnp.exp(0.5 * self.logvar)
+ self.var = jnp.exp(self.logvar)
+ if self.deterministic:
+ self.var = self.std = jnp.zeros_like(self.mean)
+
+ def sample(self, key):
+ return self.mean + self.std * jax.random.normal(key, self.mean.shape)
+
+ def kl(self, other=None):
+ if self.deterministic:
+ return jnp.array([0.0])
+
+ if other is None:
+ return 0.5 * jnp.sum(self.mean**2 + self.var - 1.0 - self.logvar, axis=[1, 2, 3])
+
+ return 0.5 * jnp.sum(
+ jnp.square(self.mean - other.mean) / other.var + self.var / other.var - 1.0 - self.logvar + other.logvar,
+ axis=[1, 2, 3],
+ )
+
+ def nll(self, sample, axis=[1, 2, 3]):
+ if self.deterministic:
+ return jnp.array([0.0])
+
+ logtwopi = jnp.log(2.0 * jnp.pi)
+ return 0.5 * jnp.sum(logtwopi + self.logvar + jnp.square(sample - self.mean) / self.var, axis=axis)
+
+ def mode(self):
+ return self.mean
+
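+# Usage sketch (illustrative shapes): `parameters` stacks mean and log-variance
+# along the last (channel) axis; sampling uses the reparameterization trick.
+#
+#   key = jax.random.PRNGKey(0)
+#   moments = jnp.zeros((1, 32, 32, 8))  # last axis = 2 * latent channels
+#   dist = FlaxDiagonalGaussianDistribution(moments)
+#   z = dist.sample(key)   # mean + std * N(0, 1)
+#   kl = dist.kl()         # KL divergence to a standard normal, per example
+#   z_map = dist.mode()    # deterministic latent (the mean)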
+
+@flax_register_to_config
+class FlaxAutoencoderKL(nn.Module, FlaxModelMixin, ConfigMixin):
+ r"""
+    Flax implementation of a Variational Autoencoder (VAE) with KL loss, as introduced in [Auto-Encoding
+    Variational Bayes](https://arxiv.org/abs/1312.6114) by Diederik P. Kingma and Max Welling.
+
+ This model is a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module)
+ subclass. Use it as a regular Flax linen Module and refer to the Flax documentation for all matter related to
+ general usage and behavior.
+
+ Finally, this model supports inherent JAX features such as:
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+ Parameters:
+ in_channels (:obj:`int`, *optional*, defaults to 3):
+ Input channels
+ out_channels (:obj:`int`, *optional*, defaults to 3):
+ Output channels
+        down_block_types (:obj:`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
+            DownEncoder block types
+        up_block_types (:obj:`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
+            UpDecoder block types
+        block_out_channels (:obj:`Tuple[int]`, *optional*, defaults to `(64,)`):
+            Tuple containing the number of output channels for each block
+        layers_per_block (:obj:`int`, *optional*, defaults to `1`):
+            Number of ResNet layers per block
+        act_fn (:obj:`str`, *optional*, defaults to `silu`):
+            Activation function
+        latent_channels (:obj:`int`, *optional*, defaults to `4`):
+            Number of latent space channels
+        norm_num_groups (:obj:`int`, *optional*, defaults to `32`):
+            Number of groups for group normalization
+        sample_size (:obj:`int`, *optional*, defaults to `32`):
+            Sample input size
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to `jnp.float32`):
+            The `dtype` of the parameters
+ """
+ in_channels: int = 3
+ out_channels: int = 3
+ down_block_types: Tuple[str] = ("DownEncoderBlock2D",)
+ up_block_types: Tuple[str] = ("UpDecoderBlock2D",)
+ block_out_channels: Tuple[int] = (64,)
+ layers_per_block: int = 1
+ act_fn: str = "silu"
+ latent_channels: int = 4
+ norm_num_groups: int = 32
+ sample_size: int = 32
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ self.encoder = FlaxEncoder(
+ in_channels=self.config.in_channels,
+ out_channels=self.config.latent_channels,
+ down_block_types=self.config.down_block_types,
+ block_out_channels=self.config.block_out_channels,
+ layers_per_block=self.config.layers_per_block,
+ act_fn=self.config.act_fn,
+ norm_num_groups=self.config.norm_num_groups,
+ double_z=True,
+ dtype=self.dtype,
+ )
+ self.decoder = FlaxDecoder(
+ in_channels=self.config.latent_channels,
+ out_channels=self.config.out_channels,
+ up_block_types=self.config.up_block_types,
+ block_out_channels=self.config.block_out_channels,
+ layers_per_block=self.config.layers_per_block,
+ norm_num_groups=self.config.norm_num_groups,
+ act_fn=self.config.act_fn,
+ dtype=self.dtype,
+ )
+ self.quant_conv = nn.Conv(
+ 2 * self.config.latent_channels,
+ kernel_size=(1, 1),
+ strides=(1, 1),
+ padding="VALID",
+ dtype=self.dtype,
+ )
+ self.post_quant_conv = nn.Conv(
+ self.config.latent_channels,
+ kernel_size=(1, 1),
+ strides=(1, 1),
+ padding="VALID",
+ dtype=self.dtype,
+ )
+
+ def init_weights(self, rng: jax.random.PRNGKey) -> FrozenDict:
+ # init input tensors
+ sample_shape = (1, self.in_channels, self.sample_size, self.sample_size)
+ sample = jnp.zeros(sample_shape, dtype=jnp.float32)
+
+ params_rng, dropout_rng, gaussian_rng = jax.random.split(rng, 3)
+ rngs = {"params": params_rng, "dropout": dropout_rng, "gaussian": gaussian_rng}
+
+ return self.init(rngs, sample)["params"]
+
+ def encode(self, sample, deterministic: bool = True, return_dict: bool = True):
+ sample = jnp.transpose(sample, (0, 2, 3, 1))
+
+ hidden_states = self.encoder(sample, deterministic=deterministic)
+ moments = self.quant_conv(hidden_states)
+ posterior = FlaxDiagonalGaussianDistribution(moments)
+
+ if not return_dict:
+ return (posterior,)
+
+ return FlaxAutoencoderKLOutput(latent_dist=posterior)
+
+ def decode(self, latents, deterministic: bool = True, return_dict: bool = True):
+ if latents.shape[-1] != self.config.latent_channels:
+ latents = jnp.transpose(latents, (0, 2, 3, 1))
+
+ hidden_states = self.post_quant_conv(latents)
+ hidden_states = self.decoder(hidden_states, deterministic=deterministic)
+
+ hidden_states = jnp.transpose(hidden_states, (0, 3, 1, 2))
+
+ if not return_dict:
+ return (hidden_states,)
+
+ return FlaxDecoderOutput(sample=hidden_states)
+
+    def __call__(self, sample, sample_posterior=False, deterministic: bool = True, return_dict: bool = True):
+        # Request dict outputs internally so the attribute access below always
+        # works; `return_dict` only controls the output type of this method.
+        posterior = self.encode(sample, deterministic=deterministic)
+        if sample_posterior:
+            rng = self.make_rng("gaussian")
+            hidden_states = posterior.latent_dist.sample(rng)
+        else:
+            hidden_states = posterior.latent_dist.mode()
+
+        sample = self.decode(hidden_states).sample
+
+        if not return_dict:
+            return (sample,)
+
+        return FlaxDecoderOutput(sample=sample)
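+
+
+# Usage sketch (illustrative; shapes and names are assumptions, not part of
+# this module):
+#
+#   vae = FlaxAutoencoderKL(sample_size=32)
+#   params = vae.init_weights(jax.random.PRNGKey(0))
+#   images = jnp.zeros((1, 3, 32, 32))  # NCHW layout, as `encode` expects
+#   posterior = vae.apply({"params": params}, images, method=vae.encode)
+#   recon = vae.apply({"params": params}, posterior.latent_dist.mode(), method=vae.decode).sample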
diff --git a/src/model/TextGen/diffusers/onnx_utils.py b/src/model/TextGen/diffusers/onnx_utils.py
new file mode 100644
index 0000000..b2c533e
--- /dev/null
+++ b/src/model/TextGen/diffusers/onnx_utils.py
@@ -0,0 +1,213 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import shutil
+from pathlib import Path
+from typing import Optional, Union
+
+import numpy as np
+
+from huggingface_hub import hf_hub_download
+
+from .utils import ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME, is_onnx_available, logging
+
+
+if is_onnx_available():
+ import onnxruntime as ort
+
+
+logger = logging.get_logger(__name__)
+
+ORT_TO_NP_TYPE = {
+ "tensor(bool)": np.bool_,
+ "tensor(int8)": np.int8,
+ "tensor(uint8)": np.uint8,
+ "tensor(int16)": np.int16,
+ "tensor(uint16)": np.uint16,
+ "tensor(int32)": np.int32,
+ "tensor(uint32)": np.uint32,
+ "tensor(int64)": np.int64,
+ "tensor(uint64)": np.uint64,
+ "tensor(float16)": np.float16,
+ "tensor(float)": np.float32,
+ "tensor(double)": np.float64,
+}
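+
+# Example (illustrative): ORT_TO_NP_TYPE["tensor(float)"] maps to np.float32;
+# callers can use this table to cast numpy inputs to the dtype an ONNX graph
+# reports, e.g. via `session.get_inputs()[0].type`.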
+
+
+class OnnxRuntimeModel:
+ def __init__(self, model=None, **kwargs):
+ logger.info("`diffusers.OnnxRuntimeModel` is experimental and might change in the future.")
+ self.model = model
+ self.model_save_dir = kwargs.get("model_save_dir", None)
+ self.latest_model_name = kwargs.get("latest_model_name", ONNX_WEIGHTS_NAME)
+
+ def __call__(self, **kwargs):
+ inputs = {k: np.array(v) for k, v in kwargs.items()}
+ return self.model.run(None, inputs)
+
+ @staticmethod
+ def load_model(path: Union[str, Path], provider=None, sess_options=None):
+        """
+        Loads an ONNX inference session with an execution provider. The default provider is `CPUExecutionProvider`.
+
+        Arguments:
+            path (`str` or `Path`):
+                Path of the ONNX model file to load
+            provider (`str`, *optional*):
+                Onnxruntime execution provider with which to load the model, defaults to `CPUExecutionProvider`
+            sess_options (`onnxruntime.SessionOptions`, *optional*):
+                Session options to pass to the `InferenceSession`
+        """
+ if provider is None:
+ logger.info("No onnxruntime provider specified, using CPUExecutionProvider")
+ provider = "CPUExecutionProvider"
+
+ return ort.InferenceSession(path, providers=[provider], sess_options=sess_options)
+
+ def _save_pretrained(self, save_directory: Union[str, Path], file_name: Optional[str] = None, **kwargs):
+ """
+ Save a model and its configuration file to a directory, so that it can be re-loaded using the
+ [`~optimum.onnxruntime.modeling_ort.ORTModel.from_pretrained`] class method. It will always save the
+ latest_model_name.
+
+ Arguments:
+ save_directory (`str` or `Path`):
+ Directory where to save the model file.
+ file_name(`str`, *optional*):
+ Overwrites the default model file name from `"model.onnx"` to `file_name`. This allows you to save the
+ model with a different name.
+ """
+ model_file_name = file_name if file_name is not None else ONNX_WEIGHTS_NAME
+
+ src_path = self.model_save_dir.joinpath(self.latest_model_name)
+ dst_path = Path(save_directory).joinpath(model_file_name)
+ try:
+ shutil.copyfile(src_path, dst_path)
+ except shutil.SameFileError:
+ pass
+
+ # copy external weights (for models >2GB)
+ src_path = self.model_save_dir.joinpath(ONNX_EXTERNAL_WEIGHTS_NAME)
+ if src_path.exists():
+ dst_path = Path(save_directory).joinpath(ONNX_EXTERNAL_WEIGHTS_NAME)
+ try:
+ shutil.copyfile(src_path, dst_path)
+ except shutil.SameFileError:
+ pass
+
+ def save_pretrained(
+ self,
+ save_directory: Union[str, os.PathLike],
+ **kwargs,
+ ):
+ """
+        Save a model to a directory so that it can be re-loaded using the [`~OnnxModel.from_pretrained`] class
+        method.
+
+ Arguments:
+ save_directory (`str` or `os.PathLike`):
+ Directory to which to save. Will be created if it doesn't exist.
+ """
+ if os.path.isfile(save_directory):
+ logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
+ return
+
+ os.makedirs(save_directory, exist_ok=True)
+
+ # saving model weights/files
+ self._save_pretrained(save_directory, **kwargs)
+
+ @classmethod
+ def _from_pretrained(
+ cls,
+ model_id: Union[str, Path],
+        use_auth_token: Optional[Union[bool, str]] = None,
+        revision: Optional[str] = None,
+ force_download: bool = False,
+ cache_dir: Optional[str] = None,
+ file_name: Optional[str] = None,
+ provider: Optional[str] = None,
+ sess_options: Optional["ort.SessionOptions"] = None,
+ **kwargs,
+ ):
+ """
+ Load a model from a directory or the HF Hub.
+
+ Arguments:
+ model_id (`str` or `Path`):
+ Directory from which to load
+            use_auth_token (`str` or `bool`, *optional*):
+                The token to use as HTTP bearer authorization; required to load models from a private or gated
+                repository
+            revision (`str`, *optional*):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id
+ cache_dir (`Union[str, Path]`, *optional*):
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the
+ standard cache should not be used.
+ force_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+ cached versions if they exist.
+ file_name(`str`):
+ Overwrites the default model file name from `"model.onnx"` to `file_name`. This allows you to load
+ different model files from the same repository or directory.
+ provider(`str`):
+ The ONNX runtime provider, e.g. `CPUExecutionProvider` or `CUDAExecutionProvider`.
+ kwargs (`Dict`, *optional*):
+ kwargs will be passed to the model during initialization
+ """
+ model_file_name = file_name if file_name is not None else ONNX_WEIGHTS_NAME
+ # load model from local directory
+ if os.path.isdir(model_id):
+ model = OnnxRuntimeModel.load_model(
+ os.path.join(model_id, model_file_name), provider=provider, sess_options=sess_options
+ )
+ kwargs["model_save_dir"] = Path(model_id)
+ # load model from hub
+ else:
+ # download model
+ model_cache_path = hf_hub_download(
+ repo_id=model_id,
+ filename=model_file_name,
+ use_auth_token=use_auth_token,
+ revision=revision,
+ cache_dir=cache_dir,
+ force_download=force_download,
+ )
+ kwargs["model_save_dir"] = Path(model_cache_path).parent
+ kwargs["latest_model_name"] = Path(model_cache_path).name
+ model = OnnxRuntimeModel.load_model(model_cache_path, provider=provider, sess_options=sess_options)
+ return cls(model=model, **kwargs)
+
+ @classmethod
+ def from_pretrained(
+ cls,
+ model_id: Union[str, Path],
+ force_download: bool = True,
+ use_auth_token: Optional[str] = None,
+ cache_dir: Optional[str] = None,
+ **model_kwargs,
+ ):
+ revision = None
+        if len(str(model_id).split("@")) == 2:
+            # `model_id` may be a `Path`, so cast to `str` before splitting
+            model_id, revision = str(model_id).split("@")
+
+ return cls._from_pretrained(
+ model_id=model_id,
+ revision=revision,
+ cache_dir=cache_dir,
+ force_download=force_download,
+ use_auth_token=use_auth_token,
+ **model_kwargs,
+ )
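+
+
+# Usage sketch (illustrative; the directory path and input name are assumptions):
+#
+#   model = OnnxRuntimeModel.from_pretrained("./my_onnx_model")
+#   outputs = model(input=np.zeros((1, 3, 224, 224), dtype=np.float32))
+#   # `outputs` is the list of arrays returned by onnxruntime's `run`.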
diff --git a/src/model/TextGen/diffusers/optimization.py b/src/model/TextGen/diffusers/optimization.py
new file mode 100644
index 0000000..e7b836b
--- /dev/null
+++ b/src/model/TextGen/diffusers/optimization.py
@@ -0,0 +1,275 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch optimization for diffusion models."""
+
+import math
+from enum import Enum
+from typing import Optional, Union
+
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import LambdaLR
+
+from .utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class SchedulerType(Enum):
+ LINEAR = "linear"
+ COSINE = "cosine"
+ COSINE_WITH_RESTARTS = "cosine_with_restarts"
+ POLYNOMIAL = "polynomial"
+ CONSTANT = "constant"
+ CONSTANT_WITH_WARMUP = "constant_with_warmup"
+
+
+def get_constant_schedule(optimizer: Optimizer, last_epoch: int = -1):
+ """
+ Create a schedule with a constant learning rate, using the learning rate set in optimizer.
+
+ Args:
+ optimizer ([`~torch.optim.Optimizer`]):
+ The optimizer for which to schedule the learning rate.
+ last_epoch (`int`, *optional*, defaults to -1):
+ The index of the last epoch when resuming training.
+
+ Return:
+ `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+ """
+ return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch)
+
+
+def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: int, last_epoch: int = -1):
+ """
+ Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate
+ increases linearly between 0 and the initial lr set in the optimizer.
+
+ Args:
+ optimizer ([`~torch.optim.Optimizer`]):
+ The optimizer for which to schedule the learning rate.
+ num_warmup_steps (`int`):
+ The number of steps for the warmup phase.
+ last_epoch (`int`, *optional*, defaults to -1):
+ The index of the last epoch when resuming training.
+
+ Return:
+ `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+ """
+
+ def lr_lambda(current_step: int):
+ if current_step < num_warmup_steps:
+ return float(current_step) / float(max(1.0, num_warmup_steps))
+ return 1.0
+
+ return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
+
+
+def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
+ """
+ Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
+ a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
+
+ Args:
+ optimizer ([`~torch.optim.Optimizer`]):
+ The optimizer for which to schedule the learning rate.
+ num_warmup_steps (`int`):
+ The number of steps for the warmup phase.
+ num_training_steps (`int`):
+ The total number of training steps.
+ last_epoch (`int`, *optional*, defaults to -1):
+ The index of the last epoch when resuming training.
+
+ Return:
+ `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+ """
+
+ def lr_lambda(current_step: int):
+ if current_step < num_warmup_steps:
+ return float(current_step) / float(max(1, num_warmup_steps))
+ return max(
+ 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
+ )
+
+ return LambdaLR(optimizer, lr_lambda, last_epoch)
+
+
+def get_cosine_schedule_with_warmup(
+ optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1
+):
+ """
+ Create a schedule with a learning rate that decreases following the values of the cosine function between the
+ initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
+ initial lr set in the optimizer.
+
+ Args:
+ optimizer ([`~torch.optim.Optimizer`]):
+ The optimizer for which to schedule the learning rate.
+ num_warmup_steps (`int`):
+ The number of steps for the warmup phase.
+ num_training_steps (`int`):
+ The total number of training steps.
+ num_cycles (`float`, *optional*, defaults to 0.5):
+            The number of waves in the cosine schedule (the default is to just decrease from the max value to 0
+ following a half-cosine).
+ last_epoch (`int`, *optional*, defaults to -1):
+ The index of the last epoch when resuming training.
+
+ Return:
+ `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+ """
+
+ def lr_lambda(current_step):
+ if current_step < num_warmup_steps:
+ return float(current_step) / float(max(1, num_warmup_steps))
+ progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
+ return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
+
+ return LambdaLR(optimizer, lr_lambda, last_epoch)
+
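+# Worked example (illustrative numbers): with num_warmup_steps=100,
+# num_training_steps=1100 and the default half-cosine (num_cycles=0.5), the
+# LR multiplier produced by lr_lambda is:
+#   step 0    -> 0.0                              (start of warmup)
+#   step 100  -> 1.0                              (end of warmup)
+#   step 600  -> 0.5 * (1 + cos(pi * 0.5)) = 0.5  (decay midpoint)
+#   step 1100 -> 0.5 * (1 + cos(pi)) = 0.0        (end of training)
+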
+
+def get_cosine_with_hard_restarts_schedule_with_warmup(
+ optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: int = 1, last_epoch: int = -1
+):
+ """
+ Create a schedule with a learning rate that decreases following the values of the cosine function between the
+ initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which it increases
+ linearly between 0 and the initial lr set in the optimizer.
+
+ Args:
+ optimizer ([`~torch.optim.Optimizer`]):
+ The optimizer for which to schedule the learning rate.
+ num_warmup_steps (`int`):
+ The number of steps for the warmup phase.
+ num_training_steps (`int`):
+ The total number of training steps.
+ num_cycles (`int`, *optional*, defaults to 1):
+ The number of hard restarts to use.
+ last_epoch (`int`, *optional*, defaults to -1):
+ The index of the last epoch when resuming training.
+
+ Return:
+ `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+ """
+
+ def lr_lambda(current_step):
+ if current_step < num_warmup_steps:
+ return float(current_step) / float(max(1, num_warmup_steps))
+ progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
+ if progress >= 1.0:
+ return 0.0
+ return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))))
+
+ return LambdaLR(optimizer, lr_lambda, last_epoch)
+
+
+def get_polynomial_decay_schedule_with_warmup(
+ optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1
+):
+ """
+ Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the
+ optimizer to end lr defined by *lr_end*, after a warmup period during which it increases linearly from 0 to the
+ initial lr set in the optimizer.
+
+ Args:
+ optimizer ([`~torch.optim.Optimizer`]):
+ The optimizer for which to schedule the learning rate.
+ num_warmup_steps (`int`):
+ The number of steps for the warmup phase.
+ num_training_steps (`int`):
+ The total number of training steps.
+        lr_end (`float`, *optional*, defaults to 1e-7):
+            The final learning rate at the end of the decay.
+        power (`float`, *optional*, defaults to 1.0):
+            The power factor of the polynomial decay.
+ last_epoch (`int`, *optional*, defaults to -1):
+ The index of the last epoch when resuming training.
+
+ Note: *power* defaults to 1.0 as in the fairseq implementation, which in turn is based on the original BERT
+ implementation at
+ https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37
+
+ Return:
+ `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+
+ """
+
+ lr_init = optimizer.defaults["lr"]
+ if not (lr_init > lr_end):
+        raise ValueError(f"lr_end ({lr_end}) must be smaller than initial lr ({lr_init})")
+
+ def lr_lambda(current_step: int):
+ if current_step < num_warmup_steps:
+ return float(current_step) / float(max(1, num_warmup_steps))
+ elif current_step > num_training_steps:
+ return lr_end / lr_init # as LambdaLR multiplies by lr_init
+ else:
+ lr_range = lr_init - lr_end
+ decay_steps = num_training_steps - num_warmup_steps
+ pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps
+ decay = lr_range * pct_remaining**power + lr_end
+ return decay / lr_init # as LambdaLR multiplies by lr_init
+
+ return LambdaLR(optimizer, lr_lambda, last_epoch)
+
+
+TYPE_TO_SCHEDULER_FUNCTION = {
+ SchedulerType.LINEAR: get_linear_schedule_with_warmup,
+ SchedulerType.COSINE: get_cosine_schedule_with_warmup,
+ SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_hard_restarts_schedule_with_warmup,
+ SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup,
+ SchedulerType.CONSTANT: get_constant_schedule,
+ SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup,
+}
+
+
+def get_scheduler(
+ name: Union[str, SchedulerType],
+ optimizer: Optimizer,
+ num_warmup_steps: Optional[int] = None,
+ num_training_steps: Optional[int] = None,
+):
+ """
+ Unified API to get any scheduler from its name.
+
+ Args:
+ name (`str` or `SchedulerType`):
+ The name of the scheduler to use.
+ optimizer (`torch.optim.Optimizer`):
+ The optimizer that will be used during training.
+        num_warmup_steps (`int`, *optional*):
+            The number of warmup steps to do. This is not required by all schedulers (hence the argument being
+            optional); the function will raise an error if it's unset and the scheduler type requires it.
+        num_training_steps (`int`, *optional*):
+            The number of training steps to do. This is not required by all schedulers (hence the argument being
+            optional); the function will raise an error if it's unset and the scheduler type requires it.
+ """
+ name = SchedulerType(name)
+ schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]
+ if name == SchedulerType.CONSTANT:
+ return schedule_func(optimizer)
+
+ # All other schedulers require `num_warmup_steps`
+ if num_warmup_steps is None:
+ raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
+
+ if name == SchedulerType.CONSTANT_WITH_WARMUP:
+ return schedule_func(optimizer, num_warmup_steps=num_warmup_steps)
+
+ # All other schedulers require `num_training_steps`
+ if num_training_steps is None:
+ raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
+
+ return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
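+
+
+# Usage sketch (illustrative; `model` stands in for any `torch.nn.Module`):
+#
+#   import torch
+#   optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
+#   lr_scheduler = get_scheduler(
+#       "cosine", optimizer, num_warmup_steps=500, num_training_steps=10_000
+#   )
+#   for step in range(10_000):
+#       ...  # forward / backward
+#       optimizer.step()
+#       lr_scheduler.step()
+#       optimizer.zero_grad()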
diff --git a/src/model/TextGen/diffusers/pipeline_flax_utils.py b/src/model/TextGen/diffusers/pipeline_flax_utils.py
new file mode 100644
index 0000000..4c34e64
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipeline_flax_utils.py
@@ -0,0 +1,506 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import inspect
+import os
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+
+import flax
+import PIL
+from flax.core.frozen_dict import FrozenDict
+from huggingface_hub import snapshot_download
+from PIL import Image
+from tqdm.auto import tqdm
+
+from .configuration_utils import ConfigMixin
+from .hub_utils import http_user_agent
+from .modeling_flax_utils import FLAX_WEIGHTS_NAME, FlaxModelMixin
+from .schedulers.scheduling_utils_flax import SCHEDULER_CONFIG_NAME, FlaxSchedulerMixin
+from .utils import CONFIG_NAME, DIFFUSERS_CACHE, BaseOutput, is_transformers_available, logging
+
+
+if is_transformers_available():
+ from transformers import FlaxPreTrainedModel
+
+INDEX_FILE = "diffusion_flax_model.bin"
+
+
+logger = logging.get_logger(__name__)
+
+
+LOADABLE_CLASSES = {
+ "diffusers": {
+ "FlaxModelMixin": ["save_pretrained", "from_pretrained"],
+ "FlaxSchedulerMixin": ["save_config", "from_config"],
+ "FlaxDiffusionPipeline": ["save_pretrained", "from_pretrained"],
+ },
+ "transformers": {
+ "PreTrainedTokenizer": ["save_pretrained", "from_pretrained"],
+ "PreTrainedTokenizerFast": ["save_pretrained", "from_pretrained"],
+ "FlaxPreTrainedModel": ["save_pretrained", "from_pretrained"],
+ "FeatureExtractionMixin": ["save_pretrained", "from_pretrained"],
+ "ProcessorMixin": ["save_pretrained", "from_pretrained"],
+ "ImageProcessingMixin": ["save_pretrained", "from_pretrained"],
+ },
+}
+
+ALL_IMPORTABLE_CLASSES = {}
+for library in LOADABLE_CLASSES:
+ ALL_IMPORTABLE_CLASSES.update(LOADABLE_CLASSES[library])
+
+
+def import_flax_or_no_model(module, class_name):
+    try:
+        # 1. First make sure that if a Flax object is present, import this one
+        class_obj = getattr(module, "Flax" + class_name)
+    except AttributeError:
+        # 2. If this doesn't work, it's not a model and we don't append "Flax";
+        # the fallback lookup must be its own try block, otherwise the
+        # ValueError below can never be raised.
+        try:
+            class_obj = getattr(module, class_name)
+        except AttributeError:
+            raise ValueError(f"Neither Flax{class_name} nor {class_name} exist in {module}")
+
+    return class_obj
+
+
+@flax.struct.dataclass
+class FlaxImagePipelineOutput(BaseOutput):
+ """
+ Output class for image pipelines.
+
+ Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`):
+            List of denoised PIL images of length `batch_size` or a numpy array of shape `(batch_size, height,
+            width, num_channels)` containing the denoised images of the diffusion pipeline.
+ """
+
+ images: Union[List[PIL.Image.Image], np.ndarray]
+
+
+class FlaxDiffusionPipeline(ConfigMixin):
+ r"""
+    Base class for Flax diffusion pipelines.
+
+ [`FlaxDiffusionPipeline`] takes care of storing all components (models, schedulers, processors) for diffusion
+ pipelines and handles methods for loading, downloading and saving models as well as a few methods common to all
+ pipelines to:
+
+ - enabling/disabling the progress bar for the denoising iteration
+
+ Class attributes:
+
+ - **config_name** ([`str`]) -- name of the config file that will store the class and module names of all
+ components of the diffusion pipeline.
+ """
+ config_name = "model_index.json"
+
+ def register_modules(self, **kwargs):
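+        # Each keyword argument becomes an attribute on the pipeline and a
+        # (library, class_name) pair recorded in the `model_index.json` config,
+        # e.g. `register_modules(unet=unet, scheduler=scheduler)`.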
+ # import it here to avoid circular import
+ from diffusers import pipelines
+
+ for name, module in kwargs.items():
+ if module is None:
+ register_dict = {name: (None, None)}
+ else:
+ # retrieve library
+ library = module.__module__.split(".")[0]
+
+ # check if the module is a pipeline module
+ pipeline_dir = module.__module__.split(".")[-2]
+ path = module.__module__.split(".")
+ is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir)
+
+ # if library is not in LOADABLE_CLASSES, then it is a custom module.
+ # Or if it's a pipeline module, then the module is inside the pipeline
+ # folder so we set the library to module name.
+ if library not in LOADABLE_CLASSES or is_pipeline_module:
+ library = pipeline_dir
+
+ # retrieve class_name
+ class_name = module.__class__.__name__
+
+ register_dict = {name: (library, class_name)}
+
+ # save model index config
+ self.register_to_config(**register_dict)
+
+ # set models
+ setattr(self, name, module)
+
+ def save_pretrained(self, save_directory: Union[str, os.PathLike], params: Union[Dict, FrozenDict]):
+ # TODO: handle inference_state
+        """
+        Save all variables of the pipeline that can be saved and loaded as well as the pipelines configuration file
+        to a directory. A pipeline variable can be saved and loaded if its class implements both a save and a load
+        method. The pipeline is easily re-loaded using the [`~FlaxDiffusionPipeline.from_pretrained`] class method.
+
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to which to save. Will be created if it doesn't exist.
+            params (`Dict` or `FrozenDict`):
+                A `PyTree` of pipeline parameters to save, keyed by the names of the pipeline components.
+        """
+ self.save_config(save_directory)
+
+ model_index_dict = dict(self.config)
+ model_index_dict.pop("_class_name")
+ model_index_dict.pop("_diffusers_version")
+ model_index_dict.pop("_module", None)
+
+ for pipeline_component_name in model_index_dict.keys():
+ sub_model = getattr(self, pipeline_component_name)
+ if sub_model is None:
+ # edge case for saving a pipeline with safety_checker=None
+ continue
+
+ model_cls = sub_model.__class__
+
+ save_method_name = None
+ # search for the model's base class in LOADABLE_CLASSES
+ for library_name, library_classes in LOADABLE_CLASSES.items():
+ library = importlib.import_module(library_name)
+ for base_class, save_load_methods in library_classes.items():
+ class_candidate = getattr(library, base_class, None)
+ if class_candidate is not None and issubclass(model_cls, class_candidate):
+ # if we found a suitable base class in LOADABLE_CLASSES then grab its save method
+ save_method_name = save_load_methods[0]
+ break
+ if save_method_name is not None:
+ break
+
+ save_method = getattr(sub_model, save_method_name)
+ expects_params = "params" in set(inspect.signature(save_method).parameters.keys())
+
+ if expects_params:
+ save_method(
+ os.path.join(save_directory, pipeline_component_name), params=params[pipeline_component_name]
+ )
+ else:
+ save_method(os.path.join(save_directory, pipeline_component_name))
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
+ r"""
+ Instantiate a Flax diffusion pipeline from pre-trained pipeline weights.
+
+ The pipeline is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated).
+
+ The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
+ pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
+ task.
+
+ The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
+ weights are discarded.
+
+ Parameters:
+ pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+ Can be either:
+
+                    - A string, the *repo id* of a pretrained pipeline hosted inside a model repo on
+                      https://huggingface.co/. Valid repo ids have to be located under a user or organization name,
+                      like `CompVis/ldm-text2im-large-256`.
+ - A path to a *directory* containing pipeline weights saved using
+ [`~FlaxDiffusionPipeline.save_pretrained`], e.g., `./my_pipeline_directory/`.
+ dtype (`str` or `jnp.dtype`, *optional*):
+ Override the default `jnp.dtype` and load the model under this dtype. If `"auto"` is passed the dtype
+ will be automatically derived from the model's weights.
+ force_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+ cached versions if they exist.
+ resume_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to delete incompletely received files. Will attempt to resume the download if such a
+ file exists.
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ output_loading_info(`bool`, *optional*, defaults to `False`):
+ Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+ local_files_only(`bool`, *optional*, defaults to `False`):
+ Whether or not to only look at local files (i.e., do not try to download the model).
+ use_auth_token (`str` or *bool*, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+ when running `huggingface-cli login` (stored in `~/.huggingface`).
+ revision (`str`, *optional*, defaults to `"main"`):
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+ identifier allowed by git.
+            mirror (`str`, *optional*):
+                Mirror source to accelerate downloads in China. If you are from China and have an accessibility
+                problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or
+                safety of the mirror site; please refer to it for more information.
+
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to overwrite the loadable and saveable variables, *i.e.* the pipeline components, of
+                the specific pipeline class. The overwritten components are then directly passed to the pipeline's
+                `__init__` method. See the example below for more information.
+
+
+
+ It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
+ models](https://huggingface.co/docs/hub/models-gated#gated-models), *e.g.* `"runwayml/stable-diffusion-v1-5"`
+
+
+
+
+
+ Activate the special ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use
+ this method in a firewalled environment.
+
+
+
+ Examples:
+
+ ```py
+        >>> import jax.numpy as jnp
+        >>> from diffusers import FlaxDiffusionPipeline
+
+        >>> # Download pipeline from huggingface.co and cache.
+        >>> # Requires being logged in to the Hugging Face Hub;
+        >>> # see more in [the documentation](https://huggingface.co/docs/hub/security-tokens)
+        >>> pipeline, params = FlaxDiffusionPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5",
+        ...     revision="bf16",
+        ...     dtype=jnp.bfloat16,
+        ... )
+
+        >>> # Download the pipeline, but use a different scheduler
+        >>> from diffusers import FlaxDPMSolverMultistepScheduler
+
+        >>> model_id = "runwayml/stable-diffusion-v1-5"
+        >>> dpm_solver, dpm_solver_state = FlaxDPMSolverMultistepScheduler.from_config(
+        ...     model_id,
+        ...     subfolder="scheduler",
+        ... )
+
+        >>> dpm_pipe, dpm_params = FlaxDiffusionPipeline.from_pretrained(
+        ...     model_id, revision="bf16", dtype=jnp.bfloat16, scheduler=dpm_solver
+        ... )
+        >>> dpm_params["scheduler"] = dpm_solver_state
+ ```
+ """
+ cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
+ resume_download = kwargs.pop("resume_download", False)
+ proxies = kwargs.pop("proxies", None)
+ local_files_only = kwargs.pop("local_files_only", False)
+ use_auth_token = kwargs.pop("use_auth_token", None)
+ revision = kwargs.pop("revision", None)
+ from_pt = kwargs.pop("from_pt", False)
+ dtype = kwargs.pop("dtype", None)
+
+ # 1. Download the checkpoints and configs
+ # use snapshot download here to get it working from from_pretrained
+ if not os.path.isdir(pretrained_model_name_or_path):
+ config_dict = cls.get_config_dict(
+ pretrained_model_name_or_path,
+ cache_dir=cache_dir,
+ resume_download=resume_download,
+ proxies=proxies,
+ local_files_only=local_files_only,
+ use_auth_token=use_auth_token,
+ revision=revision,
+ )
+ # make sure we only download sub-folders and `diffusers` filenames
+ folder_names = [k for k in config_dict.keys() if not k.startswith("_")]
+ allow_patterns = [os.path.join(k, "*") for k in folder_names]
+ allow_patterns += [FLAX_WEIGHTS_NAME, SCHEDULER_CONFIG_NAME, CONFIG_NAME, cls.config_name]
+
+ # make sure we don't download PyTorch weights
+ ignore_patterns = "*.bin"
+
+ if cls != FlaxDiffusionPipeline:
+ requested_pipeline_class = cls.__name__
+ else:
+ requested_pipeline_class = config_dict.get("_class_name", cls.__name__)
+ requested_pipeline_class = (
+ requested_pipeline_class
+ if requested_pipeline_class.startswith("Flax")
+ else "Flax" + requested_pipeline_class
+ )
+
+ user_agent = {"pipeline_class": requested_pipeline_class}
+ user_agent = http_user_agent(user_agent)
+
+ # download all allow_patterns
+ cached_folder = snapshot_download(
+ pretrained_model_name_or_path,
+ cache_dir=cache_dir,
+ resume_download=resume_download,
+ proxies=proxies,
+ local_files_only=local_files_only,
+ use_auth_token=use_auth_token,
+ revision=revision,
+ allow_patterns=allow_patterns,
+ ignore_patterns=ignore_patterns,
+ user_agent=user_agent,
+ )
+ else:
+ cached_folder = pretrained_model_name_or_path
+
+ config_dict = cls.get_config_dict(cached_folder)
+
+ # 2. Load the pipeline class, if using custom module then load it from the hub
+ # if we load from explicit class, let's use it
+ if cls != FlaxDiffusionPipeline:
+ pipeline_class = cls
+ else:
+ diffusers_module = importlib.import_module(cls.__module__.split(".")[0])
+ class_name = (
+ config_dict["_class_name"]
+ if config_dict["_class_name"].startswith("Flax")
+ else "Flax" + config_dict["_class_name"]
+ )
+ pipeline_class = getattr(diffusers_module, class_name)
+
+ # some modules can be passed directly to the init
+ # in this case they are already instantiated in `kwargs`
+ # extract them here
+ expected_modules = set(inspect.signature(pipeline_class.__init__).parameters.keys())
+ passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
+
+ init_dict, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)
+
+ init_kwargs = {}
+
+ # inference_params
+ params = {}
+
+ # import it here to avoid circular import
+ from diffusers import pipelines
+
+ # 3. Load each module in the pipeline
+ for name, (library_name, class_name) in init_dict.items():
+ if class_name is None:
+ # edge case for when the pipeline was saved with safety_checker=None
+ init_kwargs[name] = None
+ continue
+
+ is_pipeline_module = hasattr(pipelines, library_name)
+ loaded_sub_model = None
+ sub_model_should_be_defined = True
+
+ # if the model is in a pipeline module, then we load it from the pipeline
+ if name in passed_class_obj:
+ # 1. check that passed_class_obj has correct parent class
+ if not is_pipeline_module:
+ library = importlib.import_module(library_name)
+ class_obj = getattr(library, class_name)
+ importable_classes = LOADABLE_CLASSES[library_name]
+ class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}
+
+ expected_class_obj = None
+ for class_name, class_candidate in class_candidates.items():
+ if class_candidate is not None and issubclass(class_obj, class_candidate):
+ expected_class_obj = class_candidate
+
+ if not issubclass(passed_class_obj[name].__class__, expected_class_obj):
+ raise ValueError(
+ f"{passed_class_obj[name]} is of type: {type(passed_class_obj[name])}, but should be"
+ f" {expected_class_obj}"
+ )
+ elif passed_class_obj[name] is None:
+                logger.warning(
+ f"You have passed `None` for {name} to disable its functionality in {pipeline_class}. Note"
+ f" that this might lead to problems when using {pipeline_class} and is not recommended."
+ )
+ sub_model_should_be_defined = False
+ else:
+                logger.warning(
+ f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it"
+ " has the correct type"
+ )
+
+ # set passed class object
+ loaded_sub_model = passed_class_obj[name]
+ elif is_pipeline_module:
+ pipeline_module = getattr(pipelines, library_name)
+ class_obj = import_flax_or_no_model(pipeline_module, class_name)
+
+ importable_classes = ALL_IMPORTABLE_CLASSES
+ class_candidates = {c: class_obj for c in importable_classes.keys()}
+ else:
+ # else we just import it from the library.
+ library = importlib.import_module(library_name)
+ class_obj = import_flax_or_no_model(library, class_name)
+
+ importable_classes = LOADABLE_CLASSES[library_name]
+ class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}
+
+ if loaded_sub_model is None and sub_model_should_be_defined:
+ load_method_name = None
+ for class_name, class_candidate in class_candidates.items():
+ if class_candidate is not None and issubclass(class_obj, class_candidate):
+ load_method_name = importable_classes[class_name][1]
+
+ load_method = getattr(class_obj, load_method_name)
+
+                # check if the module is in a subdirectory
+                if os.path.isdir(os.path.join(cached_folder, name)):
+                    loadable_folder = os.path.join(cached_folder, name)
+                else:
+                    # not in a subfolder: load from the root of the cached folder
+                    loadable_folder = cached_folder
+
+ if issubclass(class_obj, FlaxModelMixin):
+ loaded_sub_model, loaded_params = load_method(loadable_folder, from_pt=from_pt, dtype=dtype)
+ params[name] = loaded_params
+ elif is_transformers_available() and issubclass(class_obj, FlaxPreTrainedModel):
+ if from_pt:
+ # TODO(Suraj): Fix this in Transformers. We should be able to use `_do_init=False` here
+ loaded_sub_model = load_method(loadable_folder, from_pt=from_pt)
+ loaded_params = loaded_sub_model.params
+ del loaded_sub_model._params
+ else:
+ loaded_sub_model, loaded_params = load_method(loadable_folder, _do_init=False)
+ params[name] = loaded_params
+ elif issubclass(class_obj, FlaxSchedulerMixin):
+ loaded_sub_model, scheduler_state = load_method(loadable_folder)
+ params[name] = scheduler_state
+ else:
+ loaded_sub_model = load_method(loadable_folder)
+
+ init_kwargs[name] = loaded_sub_model # UNet(...), # DiffusionSchedule(...)
+
+ model = pipeline_class(**init_kwargs, dtype=dtype)
+ return model, params
+
+ @staticmethod
+ def numpy_to_pil(images):
+ """
+ Convert a numpy image or a batch of images to a PIL image.
+ """
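+        # Example (illustrative): an array of shape (2, 512, 512, 3) with values
+        # in [0, 1] becomes a list of two 512x512 RGB PIL images.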
+ if images.ndim == 3:
+ images = images[None, ...]
+ images = (images * 255).round().astype("uint8")
+ if images.shape[-1] == 1:
+ # special case for grayscale (single channel) images
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
+ else:
+ pil_images = [Image.fromarray(image) for image in images]
+
+ return pil_images
+
+ # TODO: make it compatible with jax.lax
+ def progress_bar(self, iterable):
+ if not hasattr(self, "_progress_bar_config"):
+ self._progress_bar_config = {}
+ elif not isinstance(self._progress_bar_config, dict):
+ raise ValueError(
+ f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}."
+ )
+
+ return tqdm(iterable, **self._progress_bar_config)
+
+ def set_progress_bar_config(self, **kwargs):
+ self._progress_bar_config = kwargs
diff --git a/src/model/TextGen/diffusers/pipeline_utils.py b/src/model/TextGen/diffusers/pipeline_utils.py
new file mode 100644
index 0000000..f4d1de9
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipeline_utils.py
@@ -0,0 +1,742 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import inspect
+import os
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
+
+import numpy as np
+import torch
+
+# import diffusers
+import PIL
+from huggingface_hub import snapshot_download
+from packaging import version
+from PIL import Image
+from tqdm.auto import tqdm
+
+from .configuration_utils import ConfigMixin
+from .dynamic_modules_utils import get_class_from_dynamic_module
+from .hub_utils import http_user_agent
+from .modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT
+from .schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
+from .utils import (
+ CONFIG_NAME,
+ DIFFUSERS_CACHE,
+ ONNX_WEIGHTS_NAME,
+ WEIGHTS_NAME,
+ BaseOutput,
+ deprecate,
+ is_accelerate_available,
+ is_torch_version,
+ is_transformers_available,
+ logging,
+)
+
+
+if is_transformers_available():
+ import transformers
+ from transformers import PreTrainedModel
+
+
+INDEX_FILE = "diffusion_pytorch_model.bin"
+CUSTOM_PIPELINE_FILE_NAME = "pipeline.py"
+DUMMY_MODULES_FOLDER = "diffusers.utils"
+
+
+logger = logging.get_logger(__name__)
+
+
+LOADABLE_CLASSES = {
+ "diffusers": {
+ "ModelMixin": ["save_pretrained", "from_pretrained"],
+ "SchedulerMixin": ["save_config", "from_config"],
+ "DiffusionPipeline": ["save_pretrained", "from_pretrained"],
+ "OnnxRuntimeModel": ["save_pretrained", "from_pretrained"],
+ },
+ "transformers": {
+ "PreTrainedTokenizer": ["save_pretrained", "from_pretrained"],
+ "PreTrainedTokenizerFast": ["save_pretrained", "from_pretrained"],
+ "PreTrainedModel": ["save_pretrained", "from_pretrained"],
+ "FeatureExtractionMixin": ["save_pretrained", "from_pretrained"],
+ "ProcessorMixin": ["save_pretrained", "from_pretrained"],
+ "ImageProcessingMixin": ["save_pretrained", "from_pretrained"],
+ },
+ # make inpaint module loadable here
+ "inpaint": {
+ "ModelMixin": ["save_pretrained", "from_pretrained"],
+ "DiffusionPipeline": ["save_pretrained", "from_pretrained"],
+ "PreTrainedTokenizer": ["save_pretrained", "from_pretrained"],
+ "PreTrainedTokenizerFast": ["save_pretrained", "from_pretrained"],
+ "PreTrainedModel": ["save_pretrained", "from_pretrained"],
+ }
+}
+
+ALL_IMPORTABLE_CLASSES = {}
+for library in LOADABLE_CLASSES:
+ ALL_IMPORTABLE_CLASSES.update(LOADABLE_CLASSES[library])
+
+
+@dataclass
+class ImagePipelineOutput(BaseOutput):
+ """
+ Output class for image pipelines.
+
+ Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`):
+            List of denoised PIL images of length `batch_size` or a numpy array of shape `(batch_size, height,
+            width, num_channels)` containing the denoised images of the diffusion pipeline.
+ """
+
+ images: Union[List[PIL.Image.Image], np.ndarray]
+
+
+@dataclass
+class AudioPipelineOutput(BaseOutput):
+ """
+ Output class for audio pipelines.
+
+ Args:
+        audios (`np.ndarray`):
+            Denoised audio samples of shape `(batch_size, num_channels, sample_rate)`, returned as a numpy array.
+ """
+
+ audios: np.ndarray
+
+
+class DiffusionPipeline(ConfigMixin):
+ r"""
+    Base class for all diffusion pipelines.
+
+ [`DiffusionPipeline`] takes care of storing all components (models, schedulers, processors) for diffusion pipelines
+ and handles methods for loading, downloading and saving models as well as a few methods common to all pipelines to:
+
+ - move all PyTorch modules to the device of your choice
+ - enabling/disabling the progress bar for the denoising iteration
+
+ Class attributes:
+
+ - **config_name** ([`str`]) -- name of the config file that will store the class and module names of all
+ components of the diffusion pipeline.
+ """
+ config_name = "model_index.json"
+
+ def register_modules(self, **kwargs):
+ # import it here to avoid circular import
+ from diffusers import pipelines
+
+ for name, module in kwargs.items():
+ # retrieve library
+ if module is None:
+ register_dict = {name: (None, None)}
+ else:
+ library = module.__module__.split(".")[0]
+
+                # check if the module is a pipeline module
+                path = module.__module__.split(".")
+                pipeline_dir = path[-2] if len(path) > 2 else None
+                is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir)
+
+ # if library is not in LOADABLE_CLASSES, then it is a custom module.
+ # Or if it's a pipeline module, then the module is inside the pipeline
+ # folder so we set the library to module name.
+ if library not in LOADABLE_CLASSES or is_pipeline_module:
+ library = pipeline_dir
+
+ # retrieve class_name
+ class_name = module.__class__.__name__
+
+ register_dict = {name: (library, class_name)}
+
+ # save model index config
+ self.register_to_config(**register_dict)
+
+ # set models
+ setattr(self, name, module)
+
+ def save_pretrained(self, save_directory: Union[str, os.PathLike]):
+ """
+        Save all variables of the pipeline that can be saved and loaded as well as the pipelines configuration file
+        to a directory. A pipeline variable can be saved and loaded if its class implements both a save and a load
+        method. The pipeline is easily re-loaded using the [`~DiffusionPipeline.from_pretrained`] class method.
+
+ Arguments:
+ save_directory (`str` or `os.PathLike`):
+ Directory to which to save. Will be created if it doesn't exist.
+ """
+ self.save_config(save_directory)
+
+ model_index_dict = dict(self.config)
+ model_index_dict.pop("_class_name")
+ model_index_dict.pop("_diffusers_version")
+ model_index_dict.pop("_module", None)
+
+ for pipeline_component_name in model_index_dict.keys():
+ sub_model = getattr(self, pipeline_component_name)
+ if sub_model is None:
+ # edge case for saving a pipeline with safety_checker=None
+ continue
+
+ model_cls = sub_model.__class__
+
+ save_method_name = None
+ # search for the model's base class in LOADABLE_CLASSES
+ for library_name, library_classes in LOADABLE_CLASSES.items():
+ library = importlib.import_module(library_name)
+ for base_class, save_load_methods in library_classes.items():
+ class_candidate = getattr(library, base_class, None)
+ if class_candidate is not None and issubclass(model_cls, class_candidate):
+ # if we found a suitable base class in LOADABLE_CLASSES then grab its save method
+ save_method_name = save_load_methods[0]
+ break
+ if save_method_name is not None:
+ break
+
+ save_method = getattr(sub_model, save_method_name)
+ save_method(os.path.join(save_directory, pipeline_component_name))
+
+ def to(self, torch_device: Optional[Union[str, torch.device]] = None):
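+        # Moves every `torch.nn.Module` component of the pipeline to the target
+        # device (e.g. `pipe.to("cuda")`); non-module components are left as-is,
+        # and `self` is returned so calls can be chained.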
+ if torch_device is None:
+ return self
+
+ module_names, _ = self.extract_init_dict(dict(self.config))
+ for name in module_names.keys():
+ module = getattr(self, name)
+ if isinstance(module, torch.nn.Module):
+ if module.dtype == torch.float16 and str(torch_device) in ["cpu"]:
+ logger.warning(
+ "Pipelines loaded with `torch_dtype=torch.float16` cannot run with `cpu` device. It"
+ " is not recommended to move them to `cpu` as running them will fail. Please make"
+ " sure to use an accelerator to run the pipeline in inference, due to the lack of"
+                        " support for `float16` operations on this device in PyTorch. Please remove the"
+ " `torch_dtype=torch.float16` argument, or use another device for inference."
+ )
+ module.to(torch_device)
+ return self
+
+ @property
+ def device(self) -> torch.device:
+ r"""
+ Returns:
+ `torch.device`: The torch device on which the pipeline is located.
+ """
+ module_names, _ = self.extract_init_dict(dict(self.config))
+ for name in module_names.keys():
+ module = getattr(self, name)
+ if isinstance(module, torch.nn.Module):
+ return module.device
+ return torch.device("cpu")
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
+ r"""
+ Instantiate a PyTorch diffusion pipeline from pre-trained pipeline weights.
+
+ The pipeline is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated).
+
+ The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
+ pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
+ task.
+
+ The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
+ weights are discarded.
+
+ Parameters:
+ pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+ Can be either:
+
+                - A string, the *repo id* of a pretrained pipeline hosted inside a model repo on
+                  https://huggingface.co/. Valid repo ids have to be located under a user or organization name, like
+                  `CompVis/ldm-text2im-large-256`.
+ - A path to a *directory* containing pipeline weights saved using
+ [`~DiffusionPipeline.save_pretrained`], e.g., `./my_pipeline_directory/`.
+ torch_dtype (`str` or `torch.dtype`, *optional*):
+ Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype
+ will be automatically derived from the model's weights.
+ custom_pipeline (`str`, *optional*):
+
+
+
+ This is an experimental feature and is likely to change in the future.
+
+
+
+ Can be either:
+
+ - A string, the *repo id* of a custom pipeline hosted inside a model repo on
+ https://huggingface.co/. Valid repo ids have to be located under a user or organization name,
+ like `hf-internal-testing/diffusers-dummy-pipeline`.
+
+
+
+ It is required that the model repo has a file, called `pipeline.py` that defines the custom
+ pipeline.
+
+
+
+ - A string, the *file name* of a community pipeline hosted on GitHub under
+ https://github.com/huggingface/diffusers/tree/main/examples/community. Valid file names have to
+ match exactly the file name without `.py` located under the above link, *e.g.*
+ `clip_guided_stable_diffusion`.
+
+
+
+ Community pipelines are always loaded from the current `main` branch of GitHub.
+
+
+
+ - A path to a *directory* containing a custom pipeline, e.g., `./my_pipeline_directory/`.
+
+
+
+ It is required that the directory has a file, called `pipeline.py` that defines the custom
+ pipeline.
+
+
+
+ For more information on how to load and create custom pipelines, please have a look at [Loading and
+ Adding Custom
+ Pipelines](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview)
+
+ force_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+ cached versions if they exist.
+ resume_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to delete incompletely received files. Will attempt to resume the download if such a
+ file exists.
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ output_loading_info(`bool`, *optional*, defaults to `False`):
+ Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+ local_files_only(`bool`, *optional*, defaults to `False`):
+ Whether or not to only look at local files (i.e., do not try to download the model).
+ use_auth_token (`str` or *bool*, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+ when running `huggingface-cli login` (stored in `~/.huggingface`).
+ revision (`str`, *optional*, defaults to `"main"`):
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+ identifier allowed by git.
+            mirror (`str`, *optional*):
+                Mirror source to accelerate downloads in China. If you are from China and have an accessibility
+                problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or
+                safety of the mirror site; please refer to it for more information.
+ device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+ A map that specifies where each submodule should go. It doesn't need to be refined to each
+ parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
+ same device.
+
+ To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
+ more information about each option see [designing a device
+ map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+ low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+ Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
+ also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
+ model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch,
+ setting this argument to `True` will raise an error.
+
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to overwrite the loadable and saveable variables, *i.e.* the pipeline components, of
+                the specific pipeline class. The overwritten components are then directly passed to the pipeline's
+                `__init__` method. See example below for more information.
+
+
+
+        It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
+        models](https://huggingface.co/docs/hub/models-gated#gated-models), *e.g.* `"runwayml/stable-diffusion-v1-5"`.
+
+
+
+
+
+ Activate the special ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use
+ this method in a firewalled environment.
+
+
+
+ Examples:
+
+ ```py
+ >>> from diffusers import DiffusionPipeline
+
+ >>> # Download pipeline from huggingface.co and cache.
+ >>> pipeline = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
+
+ >>> # Download pipeline that requires an authorization token
+        >>> # For more information on access tokens, please refer to [this section
+        >>> # of the documentation](https://huggingface.co/docs/hub/security-tokens)
+ >>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+
+ >>> # Download pipeline, but overwrite scheduler
+ >>> from diffusers import LMSDiscreteScheduler
+
+ >>> scheduler = LMSDiscreteScheduler.from_config("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
+ >>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", scheduler=scheduler)
+ ```
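+
+        If you are short on GPU memory, the same call can load the weights in half precision via the
+        `torch_dtype` argument documented above. A minimal sketch:
+
+        ```py
+        >>> import torch
+
+        >>> pipeline = DiffusionPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
+        ... )
+        ```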
+ """
+ cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
+ resume_download = kwargs.pop("resume_download", False)
+ force_download = kwargs.pop("force_download", False)
+ proxies = kwargs.pop("proxies", None)
+ local_files_only = kwargs.pop("local_files_only", False)
+ use_auth_token = kwargs.pop("use_auth_token", None)
+ revision = kwargs.pop("revision", None)
+ torch_dtype = kwargs.pop("torch_dtype", None)
+ custom_pipeline = kwargs.pop("custom_pipeline", None)
+ provider = kwargs.pop("provider", None)
+ sess_options = kwargs.pop("sess_options", None)
+ device_map = kwargs.pop("device_map", None)
+ low_cpu_mem_usage = kwargs.pop(
+ "low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
+
+ if low_cpu_mem_usage and not is_accelerate_available():
+ low_cpu_mem_usage = False
+            logger.warning(
+ "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
+ " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
+ " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
+ " install accelerate\n```\n."
+ )
+
+ if device_map is not None and not is_torch_version(">=", "1.9.0"):
+ raise NotImplementedError(
+ "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
+ " `device_map=None`."
+ )
+
+ if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
+ raise NotImplementedError(
+ "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
+ " `low_cpu_mem_usage=False`."
+ )
+
+ if low_cpu_mem_usage is False and device_map is not None:
+ raise ValueError(
+ f"You cannot set `low_cpu_mem_usage` to False while using device_map={device_map} for loading and"
+ " dispatching. Please make sure to set `low_cpu_mem_usage=True`."
+ )
+
+ # 1. Download the checkpoints and configs
+        # use snapshot_download here so `from_pretrained` also works with a Hub repo id
+ if not os.path.isdir(pretrained_model_name_or_path):
+ config_dict = cls.get_config_dict(
+ pretrained_model_name_or_path,
+ cache_dir=cache_dir,
+ resume_download=resume_download,
+ force_download=force_download,
+ proxies=proxies,
+ local_files_only=local_files_only,
+ use_auth_token=use_auth_token,
+ revision=revision,
+ )
+ # make sure we only download sub-folders and `diffusers` filenames
+ folder_names = [
+ k for k in config_dict.keys() if not k.startswith("_")]
+ allow_patterns = [os.path.join(k, "*") for k in folder_names]
+ allow_patterns += [WEIGHTS_NAME, SCHEDULER_CONFIG_NAME,
+ CONFIG_NAME, ONNX_WEIGHTS_NAME, cls.config_name]
+
+ # make sure we don't download flax weights
+ ignore_patterns = "*.msgpack"
+
+ if custom_pipeline is not None:
+ allow_patterns += [CUSTOM_PIPELINE_FILE_NAME]
+
+ if cls != DiffusionPipeline:
+ requested_pipeline_class = cls.__name__
+ else:
+ requested_pipeline_class = config_dict.get(
+ "_class_name", cls.__name__)
+ user_agent = {"pipeline_class": requested_pipeline_class}
+ if custom_pipeline is not None:
+ user_agent["custom_pipeline"] = custom_pipeline
+ user_agent = http_user_agent(user_agent)
+
+ # download all allow_patterns
+ cached_folder = snapshot_download(
+ pretrained_model_name_or_path,
+ cache_dir=cache_dir,
+ resume_download=resume_download,
+ proxies=proxies,
+ local_files_only=local_files_only,
+ use_auth_token=use_auth_token,
+ revision=revision,
+ allow_patterns=allow_patterns,
+ ignore_patterns=ignore_patterns,
+ user_agent=user_agent,
+ )
+ else:
+ cached_folder = pretrained_model_name_or_path
+
+ config_dict = cls.get_config_dict(cached_folder)
+
+ # 2. Load the pipeline class, if using custom module then load it from the hub
+ # if we load from explicit class, let's use it
+ if custom_pipeline is not None:
+ pipeline_class = get_class_from_dynamic_module(
+ custom_pipeline, module_file=CUSTOM_PIPELINE_FILE_NAME, cache_dir=custom_pipeline
+ )
+ elif cls != DiffusionPipeline:
+ pipeline_class = cls
+ else:
+ diffusers_module = importlib.import_module(
+ cls.__module__.split(".")[0])
+ pipeline_class = getattr(
+ diffusers_module, config_dict["_class_name"])
+
+ # To be removed in 1.0.0
+ if pipeline_class.__name__ == "StableDiffusionInpaintPipeline" and version.parse(
+ version.parse(config_dict["_diffusers_version"]).base_version
+ ) <= version.parse("0.5.1"):
+ from diffusers import StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy
+
+ pipeline_class = StableDiffusionInpaintPipelineLegacy
+
+ deprecation_message = (
+ "You are using a legacy checkpoint for inpainting with Stable Diffusion, therefore we are loading the"
+ f" {StableDiffusionInpaintPipelineLegacy} class instead of {StableDiffusionInpaintPipeline}. For"
+ " better inpainting results, we strongly suggest using Stable Diffusion's official inpainting"
+ " checkpoint: https://huggingface.co/runwayml/stable-diffusion-inpainting instead or adapting your"
+ f" checkpoint {pretrained_model_name_or_path} to the format of"
+ " https://huggingface.co/runwayml/stable-diffusion-inpainting. Note that we do not actively maintain"
+                f" the {StableDiffusionInpaintPipelineLegacy} class and will likely remove it in version 1.0.0."
+ )
+ deprecate("StableDiffusionInpaintPipelineLegacy", "1.0.0",
+ deprecation_message, standard_warn=False)
+
+ # some modules can be passed directly to the init
+ # in this case they are already instantiated in `kwargs`
+ # extract them here
+ expected_modules = set(inspect.signature(
+ pipeline_class.__init__).parameters.keys()) - set(["self"])
+ passed_class_obj = {k: kwargs.pop(k)
+ for k in expected_modules if k in kwargs}
+
+ init_dict, unused_kwargs = pipeline_class.extract_init_dict(
+ config_dict, **kwargs)
+
+ if len(unused_kwargs) > 0:
+ logger.warning(f"Keyword arguments {unused_kwargs} not recognized.")
+
+ init_kwargs = {}
+
+ # import it here to avoid circular import
+ from diffusers import pipelines
+
+ # 3. Load each module in the pipeline
+ for name, (library_name, class_name) in init_dict.items():
+ if class_name is None:
+ # edge case for when the pipeline was saved with safety_checker=None
+ init_kwargs[name] = None
+ continue
+
+ # 3.1 - now that JAX/Flax is an official framework of the library, we might load from Flax names
+ if class_name.startswith("Flax"):
+ class_name = class_name[4:]
+
+ is_pipeline_module = hasattr(pipelines, library_name)
+ loaded_sub_model = None
+ sub_model_should_be_defined = True
+
+            # if the component was passed in explicitly, validate its class and use it directly
+ if name in passed_class_obj:
+ # 1. check that passed_class_obj has correct parent class
+ if not is_pipeline_module and passed_class_obj[name] is not None:
+ library = importlib.import_module(library_name)
+ class_obj = getattr(library, class_name)
+ importable_classes = LOADABLE_CLASSES[library_name]
+ class_candidates = {c: getattr(
+ library, c, None) for c in importable_classes.keys()}
+
+ expected_class_obj = None
+ for class_name, class_candidate in class_candidates.items():
+ if class_candidate is not None and issubclass(class_obj, class_candidate):
+ expected_class_obj = class_candidate
+
+ if not issubclass(passed_class_obj[name].__class__, expected_class_obj):
+ raise ValueError(
+ f"{passed_class_obj[name]} is of type: {type(passed_class_obj[name])}, but should be"
+ f" {expected_class_obj}"
+ )
+ elif passed_class_obj[name] is None:
+                logger.warning(
+ f"You have passed `None` for {name} to disable its functionality in {pipeline_class}. Note"
+ f" that this might lead to problems when using {pipeline_class} and is not recommended."
+ )
+ sub_model_should_be_defined = False
+ else:
+                logger.warning(
+ f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it"
+ " has the correct type"
+ )
+
+ # set passed class object
+ loaded_sub_model = passed_class_obj[name]
+ elif is_pipeline_module:
+ pipeline_module = getattr(pipelines, library_name)
+ class_obj = getattr(pipeline_module, class_name)
+ importable_classes = ALL_IMPORTABLE_CLASSES
+ class_candidates = {
+ c: class_obj for c in importable_classes.keys()}
+ else:
+ # else we just import it from the library.
+ library = importlib.import_module(library_name)
+
+ class_obj = getattr(library, class_name)
+ importable_classes = LOADABLE_CLASSES[library_name]
+ class_candidates = {c: getattr(
+ library, c, None) for c in importable_classes.keys()}
+ if loaded_sub_model is None and sub_model_should_be_defined:
+ load_method_name = None
+ for class_name, class_candidate in class_candidates.items():
+ if class_candidate is not None and issubclass(class_obj, class_candidate):
+ load_method_name = importable_classes[class_name][1]
+
+ if load_method_name is None:
+ none_module = class_obj.__module__
+ if none_module.startswith(DUMMY_MODULES_FOLDER) and "dummy" in none_module:
+ # call class_obj for nice error message of missing requirements
+ class_obj()
+
+ raise ValueError(
+ f"The component {class_obj} of {pipeline_class} cannot be loaded as it does not seem to have"
+ f" any of the loading methods defined in {ALL_IMPORTABLE_CLASSES}."
+ )
+
+ load_method = getattr(class_obj, load_method_name)
+ loading_kwargs = {}
+
+ if issubclass(class_obj, torch.nn.Module):
+ loading_kwargs["torch_dtype"] = torch_dtype
+ if issubclass(class_obj, diffusers.OnnxRuntimeModel):
+ loading_kwargs["provider"] = provider
+ loading_kwargs["sess_options"] = sess_options
+
+ is_diffusers_model = issubclass(class_obj, diffusers.ModelMixin)
+ is_transformers_model = (
+ is_transformers_available()
+ and issubclass(class_obj, PreTrainedModel)
+ and version.parse(version.parse(transformers.__version__).base_version) >= version.parse("4.20.0")
+ )
+
+            # When loading a transformers model with `device_map=None`, the weights are fully materialized and
+            # initialized, unlike in diffusers. To make default loading faster we set the
+            # `low_cpu_mem_usage=low_cpu_mem_usage` flag, which is `True` by default; it skips the redundant
+            # initialization and significantly speeds up loading.
+ if is_diffusers_model or is_transformers_model:
+ loading_kwargs["device_map"] = device_map
+ loading_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage
+
+ # check if the module is in a subdirectory
+ if os.path.isdir(os.path.join(cached_folder, name)):
+ loaded_sub_model = load_method(os.path.join(
+ cached_folder, name), **loading_kwargs)
+ else:
+ # else load from the root directory
+ loaded_sub_model = load_method(
+ cached_folder, **loading_kwargs)
+
+            # e.g. init_kwargs = {"unet": UNet2DConditionModel(...), "scheduler": PNDMScheduler(...), ...}
+ init_kwargs[name] = loaded_sub_model
+
+ # 4. Potentially add passed objects if expected
+ missing_modules = set(expected_modules) - set(init_kwargs.keys())
+ if len(missing_modules) > 0 and missing_modules <= set(passed_class_obj.keys()):
+ for module in missing_modules:
+ init_kwargs[module] = passed_class_obj[module]
+ elif len(missing_modules) > 0:
+ passed_modules = set(list(init_kwargs.keys()) +
+ list(passed_class_obj.keys()))
+ raise ValueError(
+ f"Pipeline {pipeline_class} expected {expected_modules}, but only {passed_modules} were passed."
+ )
+
+ # 5. Instantiate the pipeline
+ model = pipeline_class(**init_kwargs)
+ return model
+
+ @property
+ def components(self) -> Dict[str, Any]:
+ r"""
+
+        The `self.components` property can be useful to run different pipelines with the same weights and
+        configurations without reallocating memory.
+
+ Examples:
+
+ ```py
+ >>> from diffusers import (
+ ... StableDiffusionPipeline,
+ ... StableDiffusionImg2ImgPipeline,
+ ... StableDiffusionInpaintPipeline,
+ ... )
+
+        >>> text2img = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+        >>> img2img = StableDiffusionImg2ImgPipeline(**text2img.components)
+        >>> inpaint = StableDiffusionInpaintPipeline(**text2img.components)
+ ```
+
+ Returns:
+            A dictionary containing all the modules needed to initialize the pipeline.
+ """
+ components = {k: getattr(self, k)
+ for k in self.config.keys() if not k.startswith("_")}
+ expected_modules = set(inspect.signature(
+ self.__init__).parameters.keys()) - set(["self"])
+
+ if set(components.keys()) != expected_modules:
+ raise ValueError(
+ f"{self} has been incorrectly initialized or {self.__class__} is incorrectly implemented. Expected"
+ f" {expected_modules} to be defined, but {components} are defined."
+ )
+
+ return components
+
+ @staticmethod
+ def numpy_to_pil(images):
+ """
+        Convert a numpy image or a batch of images to a list of PIL images.
+ """
+ if images.ndim == 3:
+ images = images[None, ...]
+ images = (images * 255).round().astype("uint8")
+ if images.shape[-1] == 1:
+ # special case for grayscale (single channel) images
+ pil_images = [Image.fromarray(
+ image.squeeze(), mode="L") for image in images]
+ else:
+ pil_images = [Image.fromarray(image) for image in images]
+
+ return pil_images
+
+ def progress_bar(self, iterable):
+ if not hasattr(self, "_progress_bar_config"):
+ self._progress_bar_config = {}
+ elif not isinstance(self._progress_bar_config, dict):
+ raise ValueError(
+ f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}."
+ )
+
+ return tqdm(iterable, **self._progress_bar_config)
+
+ def set_progress_bar_config(self, **kwargs):
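+        # kwargs are stored as-is and forwarded verbatim to `tqdm` in `progress_bar`
+        # above, e.g. `pipe.set_progress_bar_config(disable=True)` silences the bar.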
+ self._progress_bar_config = kwargs
diff --git a/src/model/TextGen/diffusers/pipelines/README.md b/src/model/TextGen/diffusers/pipelines/README.md
new file mode 100644
index 0000000..2941660
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/README.md
@@ -0,0 +1,173 @@
+# 🧨 Diffusers Pipelines
+
+Pipelines provide a simple way to run state-of-the-art diffusion models in inference.
+Most diffusion systems consist of multiple independently-trained models and highly adaptable scheduler
+components - all of which are needed to have a functioning end-to-end diffusion system.
+
+As an example, [Stable Diffusion](https://huggingface.co/blog/stable_diffusion) has three independently trained models and several supporting components:
+- [Autoencoder](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/models/vae.py#L392)
+- [Conditional Unet](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/models/unet_2d_condition.py#L12)
+- [CLIP text encoder](https://huggingface.co/docs/transformers/v4.21.2/en/model_doc/clip#transformers.CLIPTextModel)
+- a scheduler component, [scheduler](https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_pndm.py),
+- a [CLIPFeatureExtractor](https://huggingface.co/docs/transformers/v4.21.2/en/model_doc/clip#transformers.CLIPFeatureExtractor),
+- as well as a [safety checker](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py).
+
+All of these components are necessary to run stable diffusion in inference even though they were trained
+or created independently from each other.
+
+To that end, we strive to offer all open-sourced, state-of-the-art diffusion systems under a unified API.
+More specifically, we strive to provide pipelines that
+- 1. can load the officially published weights and yield 1-to-1 the same outputs as the original implementation according to the corresponding paper (*e.g.* [LDMTextToImagePipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/latent_diffusion), uses the officially released weights of [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)),
+- 2. have a simple user interface to run the model in inference (see the [Pipelines API](#pipelines-api) section),
+- 3. are easy to understand with code that is self-explanatory and can be read along-side the official paper (see [Pipelines summary](#pipelines-summary)),
+- 4. can easily be contributed by the community (see the [Contribution](#contribution) section).
+
+**Note** that pipelines do not (and should not) offer any training functionality.
+If you are looking for *official* training examples, please have a look at [examples](https://github.com/huggingface/diffusers/tree/main/examples).
+
+
+## Pipelines Summary
+
+The following table summarizes all officially supported pipelines, their corresponding paper, and, if
+available, a colab notebook to try them out directly.
+
+| Pipeline | Source | Tasks | Colab
+|-------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------|:---:|:---:|
+| [dance diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/Harmonai-org/sample-generator) | *Unconditional Audio Generation* |
+| [ddpm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | *Unconditional Image Generation* |
+| [ddim](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | *Unconditional Image Generation* | [Colab](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [latent_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | *Text-to-Image Generation* |
+| [latent_diffusion_uncond](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | *Unconditional Image Generation* |
+| [pndm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | *Unconditional Image Generation* |
+| [score_sde_ve](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* |
+| [score_sde_vp](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* |
+| [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Text-to-Image Generation* | [Colab](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Image-to-Image Text-Guided Generation* | [Colab](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+| [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Text-Guided Image Inpainting* | [Colab](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+| [stochastic_karras_ve](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | *Unconditional Image Generation* |
+
+**Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers.
+However, most of them can be adapted to use different scheduler components or even different model components. Some pipeline examples are shown in the [Examples](#examples) below.
+
+## Pipelines API
+
+Diffusion models often consist of multiple independently-trained models or other previously existing components.
+
+Each model has been trained independently on a different task, and the scheduler can easily be swapped out
+and replaced with a different one. During inference, however, we want to be able to easily load all components
+and use them - even if one component, *e.g.* CLIP's text encoder, originates from a different library, such as [Transformers](https://github.com/huggingface/transformers). To that end, all pipelines provide the following functionality:
+
+- [`from_pretrained` method](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L139) that accepts a Hugging Face Hub repository id, *e.g.* [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) or a path to a local directory, *e.g.*
+"./stable-diffusion". To correctly retrieve which models and components should be loaded, one has to provide a `model_index.json` file, *e.g.* [runwayml/stable-diffusion-v1-5/model_index.json](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/model_index.json), which defines all components that should be
+loaded into the pipelines. More specifically, for each model/component one needs to define the format `<name>: ["<library>", "<class name>"]`. `<name>` is the attribute name given to the loaded instance of `<class name>` which can be found in the library or pipeline folder called `"<library>"`.
+- [`save_pretrained`](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L90) that accepts a local path, *e.g.* `./stable-diffusion` under which all models/components of the pipeline will be saved. For each component/model a folder is created inside the local path that is named after the given attribute name, *e.g.* `./stable_diffusion/unet`.
+In addition, a `model_index.json` file is created at the root of the local path, *e.g.* `./stable_diffusion/model_index.json` so that the complete pipeline can again be instantiated
+from the local path.
+- [`to`](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L118) which accepts a `string` or `torch.device` to move all models that are of type `torch.nn.Module` to the passed device. The behavior is fully analogous to [PyTorch's `to` method](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.to).
+- [`__call__`] method to use the pipeline in inference. `__call__` defines inference logic of the pipeline and should ideally encompass all aspects of it, from pre-processing to forwarding tensors to the different models and schedulers, as well as post-processing. The API of the `__call__` method can strongly vary from pipeline to pipeline. *E.g.* a text-to-image pipeline, such as [`StableDiffusionPipeline`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py) should accept among other things the text prompt to generate the image. A pure image generation pipeline, such as [DDPMPipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/ddpm) on the other hand can be run without providing any inputs. To better understand what inputs can be adapted for
+each pipeline, one should look directly into the respective pipeline (see also the sketch below).
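+
+As a quick illustration of the methods above, here is a minimal sketch of the download/save/load round
+trip (the repository id is the one used in the examples further below):
+
+```python
+from diffusers import DiffusionPipeline
+
+# Download from the Hub, save all components locally, then re-instantiate from disk.
+pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+pipe.save_pretrained("./stable-diffusion")  # writes model_index.json plus one folder per component
+pipe = DiffusionPipeline.from_pretrained("./stable-diffusion")
+pipe = pipe.to("cuda")  # moves every torch.nn.Module component to the GPU
+```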
+
+**Note**: All pipelines have PyTorch's autograd disabled by decorating the `__call__` method with a [`torch.no_grad`](https://pytorch.org/docs/stable/generated/torch.no_grad.html) decorator because pipelines should
+not be used for training. If you want to store the gradients during the forward pass, we recommend writing your own pipeline; see also our [community-examples](https://github.com/huggingface/diffusers/tree/main/examples/community).
+
+## Contribution
+
+We are more than happy about any contribution to the officially supported pipelines 🤗. We aspire for
+all of our pipelines to be **self-contained**, **easy-to-tweak**, **beginner-friendly** and **one-purpose-only**.
+
+- **Self-contained**: A pipeline shall be as self-contained as possible. More specifically, this means that all functionality should either be directly defined in the pipeline file itself, be inherited from (and only from) the [`DiffusionPipeline` class](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L56) or be directly attached to the model and scheduler components of the pipeline.
+- **Easy-to-use**: Pipelines should be extremely easy to use - one should be able to load the pipeline and
+use it for its designated task, *e.g.* text-to-image generation, in just a couple of lines of code. Most
+logic including pre-processing, an unrolled diffusion loop, and post-processing should all happen inside the `__call__` method.
+- **Easy-to-tweak**: Certain pipelines will not be able to handle all use cases and tasks that you might like them to. If you want to use a certain pipeline for a specific use case that is not yet supported, you might have to copy the pipeline file and tweak the code to your needs. We try to make the pipeline code as readable as possible so that each part, from pre-processing to diffusing to post-processing, can easily be adapted. If you would like the community to benefit from your customized pipeline, we would love to see a contribution to our [community-examples](https://github.com/huggingface/diffusers/tree/main/examples/community). If you feel that an important pipeline should be part of the official pipelines but isn't, a contribution to the [official pipelines](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines) would be even better.
+- **One-purpose-only**: Pipelines should be used for one task and one task only. Even if two tasks are very similar from a modeling point of view, *e.g.* image2image translation and in-painting, pipelines shall be used for one task only to keep them *easy-to-tweak* and *readable*.
+
+## Examples
+
+### Text-to-Image generation with Stable Diffusion
+
+```python
+# make sure you're logged in with `huggingface-cli login`
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt).images[0]
+
+image.save("astronaut_rides_horse.png")
+```
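+
+Since `from_pretrained` accepts pipeline components as keyword arguments, the scheduler can also be
+swapped at load time. A minimal sketch, mirroring the pattern from the `from_pretrained` docstring:
+
+```python
+from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
+
+# Load the scheduler config from the same repository and override the default scheduler.
+scheduler = LMSDiscreteScheduler.from_config("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
+pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", scheduler=scheduler)
+```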
+
+### Image-to-Image text-guided generation with Stable Diffusion
+
+The `StableDiffusionImg2ImgPipeline` lets you pass a text prompt and an initial image to condition the generation of new images.
+
+```python
+import torch
+import requests
+from PIL import Image
+from io import BytesIO
+
+from diffusers import StableDiffusionImg2ImgPipeline
+
+# load the pipeline
+device = "cuda"
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5",
+ revision="fp16",
+ torch_dtype=torch.float16,
+).to(device)
+
+# let's download an initial image
+url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((768, 512))
+
+prompt = "A fantasy landscape, trending on artstation"
+
+images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+
+images[0].save("fantasy_landscape.png")
+```
+You can also run this example on colab: [Open in Colab](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+
+### Tweak prompts reusing seeds and latents
+
+You can generate your own latents to reproduce results, or tweak your prompt on a specific result you liked. [This notebook](https://github.com/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb) shows how to do it step by step. You can also run it in Google Colab [](https://colab.research.google.com/github/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb).
+
+
+### In-painting using Stable Diffusion
+
+The `StableDiffusionInpaintPipeline` lets you edit specific parts of an image by providing a mask and text prompt.
+
+```python
+import PIL
+import requests
+import torch
+from io import BytesIO
+
+from diffusers import StableDiffusionInpaintPipeline
+
+def download_image(url):
+ response = requests.get(url)
+ return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+init_image = download_image(img_url).resize((512, 512))
+mask_image = download_image(mask_url).resize((512, 512))
+
+pipe = StableDiffusionInpaintPipeline.from_pretrained(
+ "runwayml/stable-diffusion-inpainting",
+ revision="fp16",
+ torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
+```
+
+You can also run this example on colab: [Open in Colab](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
diff --git a/src/model/TextGen/diffusers/pipelines/__init__.py b/src/model/TextGen/diffusers/pipelines/__init__.py
new file mode 100644
index 0000000..ef4d23e
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/__init__.py
@@ -0,0 +1,37 @@
+from ..utils import is_flax_available, is_onnx_available, is_torch_available, is_transformers_available
+
+
+if is_torch_available():
+ from .dance_diffusion import DanceDiffusionPipeline
+ from .ddim import DDIMPipeline
+ from .ddpm import DDPMPipeline
+ from .latent_diffusion import LDMSuperResolutionPipeline
+ from .latent_diffusion_uncond import LDMPipeline
+ from .pndm import PNDMPipeline
+ from .repaint import RePaintPipeline
+ from .score_sde_ve import ScoreSdeVePipeline
+ from .stochastic_karras_ve import KarrasVePipeline
+else:
+ from ..utils.dummy_pt_objects import * # noqa F403
+
+if is_torch_available() and is_transformers_available():
+ from .latent_diffusion import LDMTextToImagePipeline
+ from .stable_diffusion import (
+ CycleDiffusionPipeline,
+ StableDiffusionImg2ImgPipeline,
+ StableDiffusionInpaintPipeline,
+ StableDiffusionInpaintPipelineLegacy,
+ StableDiffusionPipeline,
+ )
+ from .vq_diffusion import VQDiffusionPipeline
+
+if is_transformers_available() and is_onnx_available():
+ from .stable_diffusion import (
+ OnnxStableDiffusionImg2ImgPipeline,
+ OnnxStableDiffusionInpaintPipeline,
+ OnnxStableDiffusionPipeline,
+ StableDiffusionOnnxPipeline,
+ )
+
+if is_transformers_available() and is_flax_available():
+ from .stable_diffusion import FlaxStableDiffusionPipeline
diff --git a/src/model/TextGen/diffusers/pipelines/dance_diffusion/__init__.py b/src/model/TextGen/diffusers/pipelines/dance_diffusion/__init__.py
new file mode 100644
index 0000000..2ad34fc
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/dance_diffusion/__init__.py
@@ -0,0 +1,2 @@
+# flake8: noqa
+from .pipeline_dance_diffusion import DanceDiffusionPipeline
diff --git a/src/model/TextGen/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py b/src/model/TextGen/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
new file mode 100644
index 0000000..48d1688
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
@@ -0,0 +1,119 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional, Tuple, Union
+
+import torch
+
+from ...pipeline_utils import AudioPipelineOutput, DiffusionPipeline
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+class DanceDiffusionPipeline(DiffusionPipeline):
+ r"""
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Parameters:
+ unet ([`UNet1DModel`]): U-Net architecture to denoise the encoded image.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
+ [`IPNDMScheduler`].
+ """
+
+ def __init__(self, unet, scheduler):
+ super().__init__()
+ self.register_modules(unet=unet, scheduler=scheduler)
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ batch_size: int = 1,
+ num_inference_steps: int = 100,
+ generator: Optional[torch.Generator] = None,
+ audio_length_in_s: Optional[float] = None,
+ return_dict: bool = True,
+ ) -> Union[AudioPipelineOutput, Tuple]:
+ r"""
+ Args:
+ batch_size (`int`, *optional*, defaults to 1):
+ The number of audio samples to generate.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+ The number of denoising steps. More denoising steps usually lead to a higher quality audio sample at
+ the expense of slower inference.
+ generator (`torch.Generator`, *optional*):
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+ deterministic.
+ audio_length_in_s (`float`, *optional*, defaults to `self.unet.config.sample_size/self.unet.config.sample_rate`):
+ The length of the generated audio sample in seconds. Note that the output of the pipeline, *i.e.*
+ `sample_size`, will be `audio_length_in_s` * `self.unet.sample_rate`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipeline_utils.AudioPipelineOutput`] instead of a plain tuple.
+
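+        Examples:
+
+        A minimal sketch; "harmonai/maestro-150k" is assumed to be one of the publicly released Dance
+        Diffusion checkpoints:
+
+        ```py
+        >>> from diffusers import DiffusionPipeline
+
+        >>> pipe = DiffusionPipeline.from_pretrained("harmonai/maestro-150k")
+        >>> audios = pipe(audio_length_in_s=4.0).audios
+        ```
+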
+ Returns:
+ [`~pipeline_utils.AudioPipelineOutput`] or `tuple`: [`~pipelines.utils.AudioPipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+            generated audio samples.
+ """
+
+ if audio_length_in_s is None:
+ audio_length_in_s = self.unet.config.sample_size / self.unet.config.sample_rate
+
+ sample_size = audio_length_in_s * self.unet.sample_rate
+
+ down_scale_factor = 2 ** len(self.unet.up_blocks)
+ if sample_size < 3 * down_scale_factor:
+ raise ValueError(
+                f"{audio_length_in_s} is too small. Make sure it's bigger than or equal to"
+ f" {3 * down_scale_factor / self.unet.sample_rate}."
+ )
+
+ original_sample_size = int(sample_size)
+ if sample_size % down_scale_factor != 0:
+ sample_size = ((audio_length_in_s * self.unet.sample_rate) // down_scale_factor + 1) * down_scale_factor
+ logger.info(
+ f"{audio_length_in_s} is increased to {sample_size / self.unet.sample_rate} so that it can be handled"
+ f" by the model. It will be cut to {original_sample_size / self.unet.sample_rate} after the denoising"
+ " process."
+ )
+ sample_size = int(sample_size)
+
+ dtype = next(iter(self.unet.parameters())).dtype
+ audio = torch.randn(
+ (batch_size, self.unet.in_channels, sample_size), generator=generator, device=self.device, dtype=dtype
+ )
+
+ # set step values
+ self.scheduler.set_timesteps(num_inference_steps, device=audio.device)
+ self.scheduler.timesteps = self.scheduler.timesteps.to(dtype)
+
+ for t in self.progress_bar(self.scheduler.timesteps):
+ # 1. predict noise model_output
+ model_output = self.unet(audio, t).sample
+
+            # 2. compute previous audio sample: x_t -> x_t-1
+ audio = self.scheduler.step(model_output, t, audio).prev_sample
+
+ audio = audio.clamp(-1, 1).float().cpu().numpy()
+
+ audio = audio[:, :, :original_sample_size]
+
+ if not return_dict:
+ return (audio,)
+
+ return AudioPipelineOutput(audios=audio)
diff --git a/src/model/TextGen/diffusers/pipelines/ddim/__init__.py b/src/model/TextGen/diffusers/pipelines/ddim/__init__.py
new file mode 100644
index 0000000..8fd3186
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/ddim/__init__.py
@@ -0,0 +1,2 @@
+# flake8: noqa
+from .pipeline_ddim import DDIMPipeline
diff --git a/src/model/TextGen/diffusers/pipelines/ddim/pipeline_ddim.py b/src/model/TextGen/diffusers/pipelines/ddim/pipeline_ddim.py
new file mode 100644
index 0000000..6db6298
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/ddim/pipeline_ddim.py
@@ -0,0 +1,122 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple, Union
+
+import torch
+
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...utils import deprecate
+
+
+class DDIMPipeline(DiffusionPipeline):
+ r"""
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Parameters:
+ unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
+ [`DDPMScheduler`], or [`DDIMScheduler`].
+ """
+
+ def __init__(self, unet, scheduler):
+ super().__init__()
+ self.register_modules(unet=unet, scheduler=scheduler)
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ batch_size: int = 1,
+ generator: Optional[torch.Generator] = None,
+ eta: float = 0.0,
+ num_inference_steps: int = 50,
+ use_clipped_model_output: Optional[bool] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ **kwargs,
+ ) -> Union[ImagePipelineOutput, Tuple]:
+ r"""
+ Args:
+ batch_size (`int`, *optional*, defaults to 1):
+ The number of images to generate.
+ generator (`torch.Generator`, *optional*):
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+ deterministic.
+ eta (`float`, *optional*, defaults to 0.0):
+ The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM).
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ use_clipped_model_output (`bool`, *optional*, defaults to `None`):
+ if `True` or `False`, see documentation for `DDIMScheduler.step`. If `None`, nothing is passed
+ downstream to the scheduler. So use `None` for schedulers which don't support this argument.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+
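+        Examples:
+
+        A minimal sketch; it pairs the UNet of the public "google/ddpm-cifar10-32" checkpoint (an
+        assumption) with a default [`DDIMScheduler`]:
+
+        ```py
+        >>> from diffusers import DDIMPipeline, DDIMScheduler, UNet2DModel
+
+        >>> unet = UNet2DModel.from_pretrained("google/ddpm-cifar10-32", subfolder="unet")
+        >>> pipe = DDIMPipeline(unet=unet, scheduler=DDIMScheduler())
+        >>> image = pipe(num_inference_steps=50).images[0]
+        ```
+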
+ Returns:
+ [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+ generated images.
+ """
+
+ if generator is not None and generator.device.type != self.device.type and self.device.type != "mps":
+ message = (
+ f"The `generator` device is `{generator.device}` and does not match the pipeline "
+ f"device `{self.device}`, so the `generator` will be ignored. "
+ f'Please use `generator=torch.Generator(device="{self.device}")` instead.'
+ )
+ deprecate(
+ "generator.device == 'cpu'",
+ "0.11.0",
+ message,
+ )
+ generator = None
+
+ # Sample gaussian noise to begin loop
+ image_shape = (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size)
+ if self.device.type == "mps":
+ # randn does not work reproducibly on mps
+ image = torch.randn(image_shape, generator=generator)
+ image = image.to(self.device)
+ else:
+ image = torch.randn(image_shape, generator=generator, device=self.device)
+
+ # set step values
+ self.scheduler.set_timesteps(num_inference_steps)
+
+ for t in self.progress_bar(self.scheduler.timesteps):
+ # 1. predict noise model_output
+ model_output = self.unet(image, t).sample
+
+ # 2. predict previous mean of image x_t-1 and add variance depending on eta
+ # eta corresponds to η in paper and should be between [0, 1]
+ # do x_t -> x_t-1
+ image = self.scheduler.step(
+ model_output, t, image, eta=eta, use_clipped_model_output=use_clipped_model_output, generator=generator
+ ).prev_sample
+
+ image = (image / 2 + 0.5).clamp(0, 1)
+ image = image.cpu().permute(0, 2, 3, 1).numpy()
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ if not return_dict:
+ return (image,)
+
+ return ImagePipelineOutput(images=image)
diff --git a/src/model/TextGen/diffusers/pipelines/ddpm/__init__.py b/src/model/TextGen/diffusers/pipelines/ddpm/__init__.py
new file mode 100644
index 0000000..8889bda
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/ddpm/__init__.py
@@ -0,0 +1,2 @@
+# flake8: noqa
+from .pipeline_ddpm import DDPMPipeline
diff --git a/src/model/TextGen/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/model/TextGen/diffusers/pipelines/ddpm/pipeline_ddpm.py
new file mode 100644
index 0000000..b719466
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -0,0 +1,125 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional, Tuple, Union
+
+import torch
+
+from ...configuration_utils import FrozenDict
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...utils import deprecate
+
+
+class DDPMPipeline(DiffusionPipeline):
+ r"""
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Parameters:
+ unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
+ [`DDPMScheduler`], or [`DDIMScheduler`].
+ """
+
+ def __init__(self, unet, scheduler):
+ super().__init__()
+ self.register_modules(unet=unet, scheduler=scheduler)
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ batch_size: int = 1,
+ generator: Optional[torch.Generator] = None,
+ num_inference_steps: int = 1000,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ **kwargs,
+ ) -> Union[ImagePipelineOutput, Tuple]:
+ r"""
+ Args:
+ batch_size (`int`, *optional*, defaults to 1):
+ The number of images to generate.
+ generator (`torch.Generator`, *optional*):
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+ deterministic.
+ num_inference_steps (`int`, *optional*, defaults to 1000):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+
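+        Examples:
+
+        A minimal sketch; "google/ddpm-cifar10-32" is assumed to be a publicly available DDPM
+        checkpoint:
+
+        ```py
+        >>> from diffusers import DDPMPipeline
+
+        >>> pipe = DDPMPipeline.from_pretrained("google/ddpm-cifar10-32")
+        >>> image = pipe().images[0]
+        ```
+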
+ Returns:
+ [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+ generated images.
+ """
+ message = (
+ "Please make sure to instantiate your scheduler with `predict_epsilon` instead. E.g. `scheduler ="
+            " DDPMScheduler.from_config(<model_id>, predict_epsilon=True)`."
+ )
+ predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
+
+ if predict_epsilon is not None:
+ new_config = dict(self.scheduler.config)
+ new_config["predict_epsilon"] = predict_epsilon
+ self.scheduler._internal_dict = FrozenDict(new_config)
+
+ if generator is not None and generator.device.type != self.device.type and self.device.type != "mps":
+ message = (
+ f"The `generator` device is `{generator.device}` and does not match the pipeline "
+ f"device `{self.device}`, so the `generator` will be ignored. "
+ f'Please use `torch.Generator(device="{self.device}")` instead.'
+ )
+ deprecate(
+ "generator.device == 'cpu'",
+ "0.11.0",
+ message,
+ )
+ generator = None
+
+ # Sample gaussian noise to begin loop
+ image_shape = (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size)
+ if self.device.type == "mps":
+ # randn does not work reproducibly on mps
+ image = torch.randn(image_shape, generator=generator)
+ image = image.to(self.device)
+ else:
+ image = torch.randn(image_shape, generator=generator, device=self.device)
+
+ # set step values
+ self.scheduler.set_timesteps(num_inference_steps)
+
+ for t in self.progress_bar(self.scheduler.timesteps):
+ # 1. predict noise model_output
+ model_output = self.unet(image, t).sample
+
+ # 2. compute previous image: x_t -> x_t-1
+ image = self.scheduler.step(
+ model_output, t, image, generator=generator, predict_epsilon=predict_epsilon
+ ).prev_sample
+
+ image = (image / 2 + 0.5).clamp(0, 1)
+ image = image.cpu().permute(0, 2, 3, 1).numpy()
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ if not return_dict:
+ return (image,)
+
+ return ImagePipelineOutput(images=image)
diff --git a/src/model/TextGen/diffusers/pipelines/latent_diffusion/__init__.py b/src/model/TextGen/diffusers/pipelines/latent_diffusion/__init__.py
new file mode 100644
index 0000000..5544527
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/latent_diffusion/__init__.py
@@ -0,0 +1,7 @@
+# flake8: noqa
+from ...utils import is_transformers_available
+from .pipeline_latent_diffusion_superresolution import LDMSuperResolutionPipeline
+
+
+if is_transformers_available():
+ from .pipeline_latent_diffusion import LDMBertModel, LDMTextToImagePipeline
diff --git a/src/model/TextGen/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/src/model/TextGen/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
new file mode 100644
index 0000000..feb5b00
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
@@ -0,0 +1,707 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+
+from transformers.activations import ACT2FN
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_outputs import BaseModelOutput
+from transformers.modeling_utils import PreTrainedModel
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.utils import logging
+
+from ...models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+
+
+class LDMTextToImagePipeline(DiffusionPipeline):
+ r"""
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Parameters:
+ vqvae ([`VQModel`]):
+ Vector-quantized (VQ) Model to encode and decode images to and from latent representations.
+ bert ([`LDMBertModel`]):
+ Text-encoder model based on [BERT](https://huggingface.co/docs/transformers/model_doc/bert) architecture.
+ tokenizer (`transformers.BertTokenizer`):
+ Tokenizer of class
+ [BertTokenizer](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer).
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ """
+
+ def __init__(
+ self,
+ vqvae: Union[VQModel, AutoencoderKL],
+ bert: PreTrainedModel,
+ tokenizer: PreTrainedTokenizer,
+ unet: Union[UNet2DModel, UNet2DConditionModel],
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+ ):
+ super().__init__()
+ self.register_modules(vqvae=vqvae, bert=bert, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ prompt: Union[str, List[str]],
+ height: Optional[int] = 256,
+ width: Optional[int] = 256,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 1.0,
+ eta: Optional[float] = 0.0,
+ generator: Optional[torch.Generator] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ **kwargs,
+ ) -> Union[Tuple, ImagePipelineOutput]:
+ r"""
+ Args:
+ prompt (`str` or `List[str]`):
+ The prompt or prompts to guide the image generation.
+ height (`int`, *optional*, defaults to 256):
+ The height in pixels of the generated image.
+ width (`int`, *optional*, defaults to 256):
+ The width in pixels of the generated image.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ guidance_scale (`float`, *optional*, defaults to 1.0):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the [DDIM paper](https://arxiv.org/abs/2010.02502). Only
+                applies to the [`DDIMScheduler`] and is ignored for other schedulers.
+ generator (`torch.Generator`, *optional*):
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+ deterministic.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+
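+        Examples:
+
+        A minimal sketch using the public "CompVis/ldm-text2im-large-256" checkpoint referenced in the
+        `from_pretrained` docstring:
+
+        ```py
+        >>> from diffusers import DiffusionPipeline
+
+        >>> pipe = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
+        >>> image = pipe("a painting of a squirrel eating a burger", guidance_scale=6.0).images[0]
+        ```
+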
+ Returns:
+ [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+ generated images.
+ """
+
+ if isinstance(prompt, str):
+ batch_size = 1
+ elif isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ if height % 8 != 0 or width % 8 != 0:
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+ # get unconditional embeddings for classifier free guidance
+ if guidance_scale != 1.0:
+ uncond_input = self.tokenizer([""] * batch_size, padding="max_length", max_length=77, return_tensors="pt")
+ uncond_embeddings = self.bert(uncond_input.input_ids.to(self.device))[0]
+
+ # get prompt text embeddings
+ text_input = self.tokenizer(prompt, padding="max_length", max_length=77, return_tensors="pt")
+ text_embeddings = self.bert(text_input.input_ids.to(self.device))[0]
+
+ latents = torch.randn(
+ (batch_size, self.unet.in_channels, height // 8, width // 8),
+ generator=generator,
+ )
+ latents = latents.to(self.device)
+
+ self.scheduler.set_timesteps(num_inference_steps)
+
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+
+ extra_kwargs = {}
+ if accepts_eta:
+ extra_kwargs["eta"] = eta
+
+ for t in self.progress_bar(self.scheduler.timesteps):
+ if guidance_scale == 1.0:
+ # guidance_scale of 1 means no guidance
+ latents_input = latents
+ context = text_embeddings
+ else:
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ latents_input = torch.cat([latents] * 2)
+ context = torch.cat([uncond_embeddings, text_embeddings])
+
+ # predict the noise residual
+ noise_pred = self.unet(latents_input, t, encoder_hidden_states=context).sample
+ # perform guidance
+ if guidance_scale != 1.0:
+ noise_pred_uncond, noise_prediction_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_kwargs).prev_sample
+
+        # scale and decode the image latents with the VQ-VAE
+ latents = 1 / 0.18215 * latents
+ image = self.vqvae.decode(latents).sample
+
+ image = (image / 2 + 0.5).clamp(0, 1)
+ image = image.cpu().permute(0, 2, 3, 1).numpy()
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ if not return_dict:
+ return (image,)
+
+ return ImagePipelineOutput(images=image)
+
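+# A minimal usage sketch for the pipeline above (hedged: the checkpoint id
+# "CompVis/ldm-text2im-large-256" is an assumption; any LDM text-to-image checkpoint
+# exposing `bert`, `tokenizer`, `unet`, `vqvae`, and `scheduler` modules should work):
+#
+#   from diffusers import DiffusionPipeline
+#
+#   pipe = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
+#   image = pipe("a painting of a squirrel eating a burger", guidance_scale=6.0).images[0]
+#   image.save("squirrel.png")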
+
+################################################################################
+# Code for the text transformer model
+################################################################################
+""" PyTorch LDMBERT model."""
+
+
+logger = logging.get_logger(__name__)
+
+LDMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
+ "ldm-bert",
+ # See all LDMBert models at https://huggingface.co/models?filter=ldmbert
+]
+
+
+LDMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+ "ldm-bert": "https://huggingface.co/valhalla/ldm-bert/blob/main/config.json",
+}
+
+
+""" LDMBERT model configuration"""
+
+
+class LDMBertConfig(PretrainedConfig):
+ model_type = "ldmbert"
+ keys_to_ignore_at_inference = ["past_key_values"]
+ attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
+
+ def __init__(
+ self,
+ vocab_size=30522,
+ max_position_embeddings=77,
+ encoder_layers=32,
+ encoder_ffn_dim=5120,
+ encoder_attention_heads=8,
+ head_dim=64,
+ encoder_layerdrop=0.0,
+ activation_function="gelu",
+ d_model=1280,
+ dropout=0.1,
+ attention_dropout=0.0,
+ activation_dropout=0.0,
+ init_std=0.02,
+ classifier_dropout=0.0,
+ scale_embedding=False,
+ use_cache=True,
+ pad_token_id=0,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.d_model = d_model
+ self.encoder_ffn_dim = encoder_ffn_dim
+ self.encoder_layers = encoder_layers
+ self.encoder_attention_heads = encoder_attention_heads
+ self.head_dim = head_dim
+ self.dropout = dropout
+ self.attention_dropout = attention_dropout
+ self.activation_dropout = activation_dropout
+ self.activation_function = activation_function
+ self.init_std = init_std
+ self.encoder_layerdrop = encoder_layerdrop
+ self.classifier_dropout = classifier_dropout
+ self.use_cache = use_cache
+ self.num_hidden_layers = encoder_layers
+ self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
+
+ super().__init__(pad_token_id=pad_token_id, **kwargs)
+
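+# Instantiation sketch (hedged: the defaults above are assumed to match the
+# `valhalla/ldm-bert` checkpoint referenced in LDMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP):
+#
+#   config = LDMBertConfig()            # 32 layers, d_model=1280, 8 heads, 77-token context
+#   assert config.hidden_size == 1280   # `attribute_map` aliases hidden_size -> d_model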
+
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+ """
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+ """
+ bsz, src_len = mask.size()
+ tgt_len = tgt_len if tgt_len is not None else src_len
+
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+ inverted_mask = 1.0 - expanded_mask
+
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
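+# Worked example for `_expand_mask` (illustrative): with mask = torch.tensor([[1, 1, 0]])
+# and dtype=torch.float32, the result has shape [1, 1, 3, 3]; entries attending to the
+# padded third token hold torch.finfo(torch.float32).min, which softmax maps to ~0 weight.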
+
+# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->LDMBert
+class LDMBertAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(
+ self,
+ embed_dim: int,
+ num_heads: int,
+ head_dim: int,
+ dropout: float = 0.0,
+ is_decoder: bool = False,
+ bias: bool = False,
+ ):
+ super().__init__()
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+ self.dropout = dropout
+ self.head_dim = head_dim
+ self.inner_dim = head_dim * num_heads
+
+ self.scaling = self.head_dim**-0.5
+ self.is_decoder = is_decoder
+
+ self.k_proj = nn.Linear(embed_dim, self.inner_dim, bias=bias)
+ self.v_proj = nn.Linear(embed_dim, self.inner_dim, bias=bias)
+ self.q_proj = nn.Linear(embed_dim, self.inner_dim, bias=bias)
+ self.out_proj = nn.Linear(self.inner_dim, embed_dim)
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ key_value_states: Optional[torch.Tensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ layer_head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """Input shape: Batch x Time x Channel"""
+
+ # if key_value_states are provided this layer is used as a cross-attention layer
+ # for the decoder
+ is_cross_attention = key_value_states is not None
+
+ bsz, tgt_len, _ = hidden_states.size()
+
+ # get query proj
+ query_states = self.q_proj(hidden_states) * self.scaling
+ # get key, value proj
+ if is_cross_attention and past_key_value is not None:
+ # reuse k,v, cross_attentions
+ key_states = past_key_value[0]
+ value_states = past_key_value[1]
+ elif is_cross_attention:
+ # cross_attentions
+ key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+ value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+ elif past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+ else:
+ # self_attention
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+ if self.is_decoder:
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+ # Further calls to cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
+ past_key_value = (key_states, value_states)
+
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+ key_states = key_states.view(*proj_shape)
+ value_states = value_states.view(*proj_shape)
+
+ src_len = key_states.size(1)
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+ )
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+ if layer_head_mask is not None:
+ if layer_head_mask.size() != (self.num_heads,):
+ raise ValueError(
+ f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+ f" {layer_head_mask.size()}"
+ )
+ attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+ if output_attentions:
+ # this operation is a bit awkward, but it's required to
+ # make sure that attn_weights keeps its gradient.
+ # In order to do so, attn_weights have to be reshaped
+ # twice and have to be reused in the following
+ attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+ attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+ else:
+ attn_weights_reshaped = None
+
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+ attn_output = torch.bmm(attn_probs, value_states)
+
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+ attn_output = attn_output.transpose(1, 2)
+
+        # Use `self.inner_dim` (stored on the module) rather than inferring the width from `hidden_states`,
+        # because `attn_output` can be partitioned across GPUs when using tensor-parallelism.
+ attn_output = attn_output.reshape(bsz, tgt_len, self.inner_dim)
+
+ attn_output = self.out_proj(attn_output)
+
+ return attn_output, attn_weights_reshaped, past_key_value
+
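+# Shape walk-through of the forward pass above (illustrative numbers): with bsz=2,
+# tgt_len=77, num_heads=8, and head_dim=64, the q/k/v projections give [2, 77, 512];
+# `_shape` plus the view to `proj_shape` yields [16, 77, 64]; `attn_weights` is
+# [16, 77, 77]; the final reshape and `out_proj` map [2, 77, 512] back to `embed_dim`.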
+
+class LDMBertEncoderLayer(nn.Module):
+ def __init__(self, config: LDMBertConfig):
+ super().__init__()
+ self.embed_dim = config.d_model
+ self.self_attn = LDMBertAttention(
+ embed_dim=self.embed_dim,
+ num_heads=config.encoder_attention_heads,
+ head_dim=config.head_dim,
+ dropout=config.attention_dropout,
+ )
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+ self.dropout = config.dropout
+ self.activation_fn = ACT2FN[config.activation_function]
+ self.activation_dropout = config.activation_dropout
+ self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+ self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+ self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+ def forward(
+ self,
+ hidden_states: torch.FloatTensor,
+ attention_mask: torch.FloatTensor,
+ layer_head_mask: torch.FloatTensor,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
+ """
+ Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ `(encoder_attention_heads,)`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+ residual = hidden_states
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+ hidden_states, attn_weights, _ = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ layer_head_mask=layer_head_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+
+ residual = hidden_states
+ hidden_states = self.final_layer_norm(hidden_states)
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+ hidden_states = self.fc2(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+
+ if hidden_states.dtype == torch.float16 and (
+ torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
+ ):
+ clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+ hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
+
+
+# Copied from transformers.models.bart.modeling_bart.BartPretrainedModel with Bart->LDMBert
+class LDMBertPreTrainedModel(PreTrainedModel):
+ config_class = LDMBertConfig
+ base_model_prefix = "model"
+ _supports_gradient_checkpointing = True
+ _keys_to_ignore_on_load_unexpected = [r"encoder\.version", r"decoder\.version"]
+
+ def _init_weights(self, module):
+ std = self.config.init_std
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, (LDMBertEncoder,)):
+ module.gradient_checkpointing = value
+
+ @property
+ def dummy_inputs(self):
+ pad_token = self.config.pad_token_id
+ input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
+ dummy_inputs = {
+ "attention_mask": input_ids.ne(pad_token),
+ "input_ids": input_ids,
+ }
+ return dummy_inputs
+
+
+class LDMBertEncoder(LDMBertPreTrainedModel):
+ """
+    Transformer encoder consisting of *config.encoder_layers* self-attention layers. Each layer is a
+    [`LDMBertEncoderLayer`].
+
+    Args:
+        config: LDMBertConfig
+ """
+
+ def __init__(self, config: LDMBertConfig):
+ super().__init__(config)
+
+ self.dropout = config.dropout
+
+ embed_dim = config.d_model
+ self.padding_idx = config.pad_token_id
+ self.max_source_positions = config.max_position_embeddings
+
+ self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim)
+ self.embed_positions = nn.Embedding(config.max_position_embeddings, embed_dim)
+ self.layers = nn.ModuleList([LDMBertEncoderLayer(config) for _ in range(config.encoder_layers)])
+ self.layer_norm = nn.LayerNorm(embed_dim)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.embed_tokens = value
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutput]:
+ r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+ provide it.
+
+ Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.BaseModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ input_shape = input_ids.size()
+ input_ids = input_ids.view(-1, input_shape[-1])
+ elif inputs_embeds is not None:
+ input_shape = inputs_embeds.size()[:-1]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+
+ seq_len = input_shape[1]
+ if position_ids is None:
+ position_ids = torch.arange(seq_len, dtype=torch.long, device=inputs_embeds.device).expand((1, -1))
+ embed_pos = self.embed_positions(position_ids)
+
+ hidden_states = inputs_embeds + embed_pos
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+ # expand attention_mask
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ # check if head_mask has a correct number of layers specified if desired
+ if head_mask is not None:
+ if head_mask.size()[0] != (len(self.layers)):
+ raise ValueError(
+ f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
+ f" {head_mask.size()[0]}."
+ )
+
+ for idx, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(encoder_layer),
+ hidden_states,
+ attention_mask,
+ (head_mask[idx] if head_mask is not None else None),
+ )
+ else:
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask,
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ hidden_states = self.layer_norm(hidden_states)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ )
+
+
+class LDMBertModel(LDMBertPreTrainedModel):
+ _no_split_modules = []
+
+ def __init__(self, config: LDMBertConfig):
+ super().__init__(config)
+ self.model = LDMBertEncoder(config)
+ self.to_logits = nn.Linear(config.hidden_size, config.vocab_size)
+
+ def forward(
+ self,
+ input_ids=None,
+ attention_mask=None,
+ position_ids=None,
+ head_mask=None,
+ inputs_embeds=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+ outputs = self.model(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ return outputs
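+
+# Encoding sketch (hedged: the pipeline above pairs this encoder with a BERT-style
+# tokenizer padded to 77 tokens; the zero input_ids below are placeholders):
+#
+#   model = LDMBertModel(LDMBertConfig())
+#   input_ids = torch.zeros(1, 77, dtype=torch.long)
+#   hidden = model(input_ids)[0]   # last_hidden_state of shape [1, 77, 1280]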
diff --git a/src/model/TextGen/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/model/TextGen/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
new file mode 100644
index 0000000..044ff35
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
@@ -0,0 +1,169 @@
+import inspect
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.utils.checkpoint
+
+import PIL
+
+from ...models import UNet2DModel, VQModel
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...schedulers import (
+ DDIMScheduler,
+ DPMSolverMultistepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+)
+
+
+def preprocess(image):
+ w, h = image.size
+ w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
+ image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+ image = np.array(image).astype(np.float32) / 255.0
+ image = image[None].transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image)
+ return 2.0 * image - 1.0
+
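+# Example for `preprocess` (illustrative): a 500x333 PIL image is resized to 480x320
+# (the nearest multiples of 32), scaled to [0, 1], reordered to NCHW, and mapped to [-1, 1].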
+
+class LDMSuperResolutionPipeline(DiffusionPipeline):
+ r"""
+    A pipeline for image super-resolution using Latent Diffusion.
+
+ This class inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Parameters:
+ vqvae ([`VQModel`]):
+ Vector-quantized (VQ) VAE Model to encode and decode images to and from latent representations.
+ unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
+ scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`],
+ [`EulerAncestralDiscreteScheduler`], [`DPMSolverMultistepScheduler`], or [`PNDMScheduler`].
+ """
+
+ def __init__(
+ self,
+ vqvae: VQModel,
+ unet: UNet2DModel,
+ scheduler: Union[
+ DDIMScheduler,
+ PNDMScheduler,
+ LMSDiscreteScheduler,
+ EulerDiscreteScheduler,
+ EulerAncestralDiscreteScheduler,
+ DPMSolverMultistepScheduler,
+ ],
+ ):
+ super().__init__()
+ self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler)
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ init_image: Union[torch.Tensor, PIL.Image.Image],
+ batch_size: Optional[int] = 1,
+ num_inference_steps: Optional[int] = 100,
+ eta: Optional[float] = 0.0,
+ generator: Optional[torch.Generator] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ **kwargs,
+ ) -> Union[Tuple, ImagePipelineOutput]:
+ r"""
+ Args:
+ init_image (`torch.Tensor` or `PIL.Image.Image`):
+ `Image`, or tensor representing an image batch, that will be used as the starting point for the
+ process.
+ batch_size (`int`, *optional*, defaults to 1):
+ Number of images to generate.
+ num_inference_steps (`int`, *optional*, defaults to 100):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`torch.Generator`, *optional*):
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+ deterministic.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+
+ Returns:
+ [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+ generated images.
+ """
+
+ if isinstance(init_image, PIL.Image.Image):
+ batch_size = 1
+ elif isinstance(init_image, torch.Tensor):
+ batch_size = init_image.shape[0]
+ else:
+ raise ValueError(
+ f"`init_image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(init_image)}"
+ )
+
+ if isinstance(init_image, PIL.Image.Image):
+ init_image = preprocess(init_image)
+
+ height, width = init_image.shape[-2:]
+
+ # in_channels should be 6: 3 for latents, 3 for low resolution image
+ latents_shape = (batch_size, self.unet.in_channels // 2, height, width)
+ latents_dtype = next(self.unet.parameters()).dtype
+
+ if self.device.type == "mps":
+ # randn does not work reproducibly on mps
+ latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype)
+ latents = latents.to(self.device)
+ else:
+ latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+
+ init_image = init_image.to(device=self.device, dtype=latents_dtype)
+
+ # set timesteps and move to the correct device
+ self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+ timesteps_tensor = self.scheduler.timesteps
+
+ # scale the initial noise by the standard deviation required by the scheduler
+ latents = latents * self.scheduler.init_noise_sigma
+
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature.
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ extra_kwargs = {}
+ if accepts_eta:
+ extra_kwargs["eta"] = eta
+
+ for t in self.progress_bar(timesteps_tensor):
+ # concat latents and low resolution image in the channel dimension.
+ latents_input = torch.cat([latents, init_image], dim=1)
+ latents_input = self.scheduler.scale_model_input(latents_input, t)
+ # predict the noise residual
+ noise_pred = self.unet(latents_input, t).sample
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_kwargs).prev_sample
+
+ # decode the image latents with the VQVAE
+ image = self.vqvae.decode(latents).sample
+ image = torch.clamp(image, -1.0, 1.0)
+ image = image / 2 + 0.5
+ image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ if not return_dict:
+ return (image,)
+
+ return ImagePipelineOutput(images=image)
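+
+# Usage sketch (hedged: "CompVis/ldm-super-resolution-4x-openimages" is an assumed
+# compatible checkpoint and `url` is a placeholder for a low-resolution input):
+#
+#   import requests
+#   from io import BytesIO
+#   from PIL import Image
+#
+#   pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")
+#   low_res = Image.open(BytesIO(requests.get(url).content)).convert("RGB").resize((128, 128))
+#   upscaled = pipe(low_res, num_inference_steps=100, eta=1.0).images[0]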
diff --git a/src/model/TextGen/diffusers/pipelines/latent_diffusion_uncond/__init__.py b/src/model/TextGen/diffusers/pipelines/latent_diffusion_uncond/__init__.py
new file mode 100644
index 0000000..0826ca7
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/latent_diffusion_uncond/__init__.py
@@ -0,0 +1,2 @@
+# flake8: noqa
+from .pipeline_latent_diffusion_uncond import LDMPipeline
diff --git a/src/model/TextGen/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py b/src/model/TextGen/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
new file mode 100644
index 0000000..5345c4e
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
@@ -0,0 +1,111 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Optional, Tuple, Union
+
+import torch
+
+from ...models import UNet2DModel, VQModel
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...schedulers import DDIMScheduler
+
+
+class LDMPipeline(DiffusionPipeline):
+ r"""
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Parameters:
+ vqvae ([`VQModel`]):
+ Vector-quantized (VQ) Model to encode and decode images to and from latent representations.
+ unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image latents.
+ scheduler ([`SchedulerMixin`]):
+ [`DDIMScheduler`] is to be used in combination with `unet` to denoise the encoded image latents.
+ """
+
+ def __init__(self, vqvae: VQModel, unet: UNet2DModel, scheduler: DDIMScheduler):
+ super().__init__()
+ self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler)
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ batch_size: int = 1,
+ generator: Optional[torch.Generator] = None,
+ eta: float = 0.0,
+ num_inference_steps: int = 50,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ **kwargs,
+ ) -> Union[Tuple, ImagePipelineOutput]:
+ r"""
+ Args:
+ batch_size (`int`, *optional*, defaults to 1):
+ Number of images to generate.
+ generator (`torch.Generator`, *optional*):
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+ deterministic.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], and is ignored for other schedulers.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+
+ Returns:
+ [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+ generated images.
+ """
+
+ latents = torch.randn(
+ (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
+ generator=generator,
+ )
+ latents = latents.to(self.device)
+
+ # scale the initial noise by the standard deviation required by the scheduler
+ latents = latents * self.scheduler.init_noise_sigma
+
+ self.scheduler.set_timesteps(num_inference_steps)
+
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+
+ extra_kwargs = {}
+ if accepts_eta:
+ extra_kwargs["eta"] = eta
+
+ for t in self.progress_bar(self.scheduler.timesteps):
+ latent_model_input = self.scheduler.scale_model_input(latents, t)
+ # predict the noise residual
+ noise_prediction = self.unet(latent_model_input, t).sample
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_prediction, t, latents, **extra_kwargs).prev_sample
+
+        # decode the image latents with the VQ-VAE
+ image = self.vqvae.decode(latents).sample
+
+ image = (image / 2 + 0.5).clamp(0, 1)
+ image = image.cpu().permute(0, 2, 3, 1).numpy()
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ if not return_dict:
+ return (image,)
+
+ return ImagePipelineOutput(images=image)
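+
+# Usage sketch (hedged: "CompVis/ldm-celebahq-256" is an assumed compatible checkpoint):
+#
+#   pipe = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
+#   image = pipe(num_inference_steps=200).images[0]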
diff --git a/src/model/TextGen/diffusers/pipelines/pndm/__init__.py b/src/model/TextGen/diffusers/pipelines/pndm/__init__.py
new file mode 100644
index 0000000..6fc46aa
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/pndm/__init__.py
@@ -0,0 +1,2 @@
+# flake8: noqa
+from .pipeline_pndm import PNDMPipeline
diff --git a/src/model/TextGen/diffusers/pipelines/pndm/pipeline_pndm.py b/src/model/TextGen/diffusers/pipelines/pndm/pipeline_pndm.py
new file mode 100644
index 0000000..ef7062d
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/pndm/pipeline_pndm.py
@@ -0,0 +1,96 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional, Tuple, Union
+
+import torch
+
+from ...models import UNet2DModel
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...schedulers import PNDMScheduler
+
+
+class PNDMPipeline(DiffusionPipeline):
+ r"""
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Parameters:
+ unet (`UNet2DModel`): U-Net architecture to denoise the encoded image latents.
+ scheduler ([`SchedulerMixin`]):
+ The `PNDMScheduler` to be used in combination with `unet` to denoise the encoded image.
+ """
+
+ unet: UNet2DModel
+ scheduler: PNDMScheduler
+
+ def __init__(self, unet: UNet2DModel, scheduler: PNDMScheduler):
+ super().__init__()
+ self.register_modules(unet=unet, scheduler=scheduler)
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ batch_size: int = 1,
+ num_inference_steps: int = 50,
+ generator: Optional[torch.Generator] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ **kwargs,
+ ) -> Union[ImagePipelineOutput, Tuple]:
+ r"""
+ Args:
+            batch_size (`int`, *optional*, defaults to 1): The number of images to generate.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator`, *optional*): A [torch
+                generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose
+                between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a
+                [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+
+ Returns:
+ [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+ generated images.
+ """
+ # For more information on the sampling method you can take a look at Algorithm 2 of
+ # the official paper: https://arxiv.org/pdf/2202.09778.pdf
+
+ # Sample gaussian noise to begin loop
+ image = torch.randn(
+ (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
+ generator=generator,
+ )
+ image = image.to(self.device)
+
+ self.scheduler.set_timesteps(num_inference_steps)
+ for t in self.progress_bar(self.scheduler.timesteps):
+ model_output = self.unet(image, t).sample
+
+ image = self.scheduler.step(model_output, t, image).prev_sample
+
+ image = (image / 2 + 0.5).clamp(0, 1)
+ image = image.cpu().permute(0, 2, 3, 1).numpy()
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ if not return_dict:
+ return (image,)
+
+ return ImagePipelineOutput(images=image)
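+
+# Usage sketch (hedged: `unet` stands for any trained `UNet2DModel`; the default
+# `PNDMScheduler()` configuration is an assumption):
+#
+#   pipe = PNDMPipeline(unet=unet, scheduler=PNDMScheduler())
+#   image = pipe(batch_size=1, num_inference_steps=50).images[0]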
diff --git a/src/model/TextGen/diffusers/pipelines/repaint/__init__.py b/src/model/TextGen/diffusers/pipelines/repaint/__init__.py
new file mode 100644
index 0000000..16bc86d
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/repaint/__init__.py
@@ -0,0 +1 @@
+from .pipeline_repaint import RePaintPipeline
diff --git a/src/model/TextGen/diffusers/pipelines/repaint/pipeline_repaint.py b/src/model/TextGen/diffusers/pipelines/repaint/pipeline_repaint.py
new file mode 100644
index 0000000..7af88f6
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/repaint/pipeline_repaint.py
@@ -0,0 +1,140 @@
+# Copyright 2022 ETH Zurich Computer Vision Lab and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+import PIL
+from tqdm.auto import tqdm
+
+from ...models import UNet2DModel
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...schedulers import RePaintScheduler
+
+
+def _preprocess_image(image: PIL.Image.Image):
+ image = np.array(image.convert("RGB"))
+ image = image[None].transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+ return image
+
+
+def _preprocess_mask(mask: PIL.Image.Image):
+ mask = np.array(mask.convert("L"))
+ mask = mask.astype(np.float32) / 255.0
+ mask = mask[None, None]
+ mask[mask < 0.5] = 0
+ mask[mask >= 0.5] = 1
+ mask = torch.from_numpy(mask)
+ return mask
+
+
+class RePaintPipeline(DiffusionPipeline):
+ unet: UNet2DModel
+ scheduler: RePaintScheduler
+
+ def __init__(self, unet, scheduler):
+ super().__init__()
+ self.register_modules(unet=unet, scheduler=scheduler)
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ original_image: Union[torch.FloatTensor, PIL.Image.Image],
+ mask_image: Union[torch.FloatTensor, PIL.Image.Image],
+ num_inference_steps: int = 250,
+ eta: float = 0.0,
+ jump_length: int = 10,
+ jump_n_sample: int = 10,
+ generator: Optional[torch.Generator] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ ) -> Union[ImagePipelineOutput, Tuple]:
+ r"""
+ Args:
+ original_image (`torch.FloatTensor` or `PIL.Image.Image`):
+ The original image to inpaint on.
+ mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
+ The mask_image where 0.0 values define which part of the original image to inpaint (change).
+            num_inference_steps (`int`, *optional*, defaults to 250):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            eta (`float`):
+                The weight of the added noise in a diffusion step. Its value is between 0.0 and 1.0; 0.0 corresponds
+                to the DDIM scheduler and 1.0 to the DDPM scheduler.
+ jump_length (`int`, *optional*, defaults to 10):
+ The number of steps taken forward in time before going backward in time for a single jump ("j" in
+ RePaint paper). Take a look at Figure 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf.
+ jump_n_sample (`int`, *optional*, defaults to 10):
+                The number of times we will make a forward time jump for a given chosen time sample. Take a look at
+ Figure 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf.
+ generator (`torch.Generator`, *optional*):
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+ deterministic.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+
+ Returns:
+ [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+ generated images.
+ """
+
+ if not isinstance(original_image, torch.FloatTensor):
+ original_image = _preprocess_image(original_image)
+ original_image = original_image.to(self.device)
+ if not isinstance(mask_image, torch.FloatTensor):
+ mask_image = _preprocess_mask(mask_image)
+ mask_image = mask_image.to(self.device)
+
+ # sample gaussian noise to begin the loop
+ image = torch.randn(
+ original_image.shape,
+ generator=generator,
+ device=self.device,
+ )
+ image = image.to(self.device)
+
+ # set step values
+ self.scheduler.set_timesteps(num_inference_steps, jump_length, jump_n_sample, self.device)
+ self.scheduler.eta = eta
+
+ t_last = self.scheduler.timesteps[0] + 1
+ for i, t in enumerate(tqdm(self.scheduler.timesteps)):
+ if t < t_last:
+ # predict the noise residual
+ model_output = self.unet(image, t).sample
+ # compute previous image: x_t -> x_t-1
+ image = self.scheduler.step(model_output, t, image, original_image, mask_image, generator).prev_sample
+
+ else:
+ # compute the reverse: x_t-1 -> x_t
+ image = self.scheduler.undo_step(image, t_last, generator)
+ t_last = t
+
+ image = (image / 2 + 0.5).clamp(0, 1)
+ image = image.cpu().permute(0, 2, 3, 1).numpy()
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ if not return_dict:
+ return (image,)
+
+ return ImagePipelineOutput(images=image)
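+
+# Usage sketch (hedged: "google/ddpm-ema-celebahq-256" is an assumed compatible DDPM
+# checkpoint, and `img`/`mask` are 256x256 PIL images supplied by the caller):
+#
+#   scheduler = RePaintScheduler.from_config("google/ddpm-ema-celebahq-256")
+#   pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler)
+#   result = pipe(original_image=img, mask_image=mask, num_inference_steps=250,
+#                 jump_length=10, jump_n_sample=10).images[0]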
diff --git a/src/model/TextGen/diffusers/pipelines/score_sde_ve/__init__.py b/src/model/TextGen/diffusers/pipelines/score_sde_ve/__init__.py
new file mode 100644
index 0000000..000d61f
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/score_sde_ve/__init__.py
@@ -0,0 +1,2 @@
+# flake8: noqa
+from .pipeline_score_sde_ve import ScoreSdeVePipeline
diff --git a/src/model/TextGen/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py b/src/model/TextGen/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py
new file mode 100644
index 0000000..7eb6a5d
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py
@@ -0,0 +1,101 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple, Union
+
+import torch
+
+from ...models import UNet2DModel
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...schedulers import ScoreSdeVeScheduler
+
+
+class ScoreSdeVePipeline(DiffusionPipeline):
+ r"""
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Parameters:
+        unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
+        scheduler ([`SchedulerMixin`]):
+            The [`ScoreSdeVeScheduler`] scheduler to be used in combination with `unet` to denoise the encoded image.
+ """
+ unet: UNet2DModel
+ scheduler: ScoreSdeVeScheduler
+
+    def __init__(self, unet: UNet2DModel, scheduler: ScoreSdeVeScheduler):
+ super().__init__()
+ self.register_modules(unet=unet, scheduler=scheduler)
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ batch_size: int = 1,
+ num_inference_steps: int = 2000,
+ generator: Optional[torch.Generator] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ **kwargs,
+ ) -> Union[ImagePipelineOutput, Tuple]:
+ r"""
+ Args:
+ batch_size (`int`, *optional*, defaults to 1):
+ The number of images to generate.
+ generator (`torch.Generator`, *optional*):
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+ deterministic.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+
+ Returns:
+ [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+ generated images.
+ """
+
+ img_size = self.unet.config.sample_size
+ shape = (batch_size, 3, img_size, img_size)
+
+ model = self.unet
+
+ sample = torch.randn(*shape, generator=generator) * self.scheduler.init_noise_sigma
+ sample = sample.to(self.device)
+
+ self.scheduler.set_timesteps(num_inference_steps)
+ self.scheduler.set_sigmas(num_inference_steps)
+
+ for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
+ sigma_t = self.scheduler.sigmas[i] * torch.ones(shape[0], device=self.device)
+
+ # correction step
+ for _ in range(self.scheduler.config.correct_steps):
+ model_output = self.unet(sample, sigma_t).sample
+ sample = self.scheduler.step_correct(model_output, sample, generator=generator).prev_sample
+
+ # prediction step
+ model_output = model(sample, sigma_t).sample
+ output = self.scheduler.step_pred(model_output, t, sample, generator=generator)
+
+ sample, sample_mean = output.prev_sample, output.prev_sample_mean
+
+ sample = sample_mean.clamp(0, 1)
+ sample = sample.cpu().permute(0, 2, 3, 1).numpy()
+ if output_type == "pil":
+ sample = self.numpy_to_pil(sample)
+
+ if not return_dict:
+ return (sample,)
+
+ return ImagePipelineOutput(images=sample)
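+
+# Usage sketch (hedged: "google/ncsnpp-celebahq-256" is an assumed compatible score-SDE
+# checkpoint; 2000 steps is slow without a GPU):
+#
+#   sde_ve = ScoreSdeVePipeline.from_pretrained("google/ncsnpp-celebahq-256")
+#   image = sde_ve(num_inference_steps=2000).images[0]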
diff --git a/src/model/TextGen/diffusers/pipelines/stable_diffusion/README.md b/src/model/TextGen/diffusers/pipelines/stable_diffusion/README.md
new file mode 100644
index 0000000..a76e4c6
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/stable_diffusion/README.md
@@ -0,0 +1,176 @@
+# Stable Diffusion
+
+## Overview
+
+Stable Diffusion was proposed in [Stable Diffusion Announcement](https://stability.ai/blog/stable-diffusion-announcement) by Patrick Esser and Robin Rombach and the Stability AI team.
+
+The summary of the model is the following:
+
+*Stable Diffusion is a text-to-image model that will empower billions of people to create stunning art within seconds. It is a breakthrough in speed and quality meaning that it can run on consumer GPUs. You can see some of the amazing output that has been created by this model without pre or post-processing on this page. The model itself builds upon the work of the team at CompVis and Runway in their widely used latent diffusion model combined with insights from the conditional diffusion models by our lead generative AI developer Katherine Crowson, Dall-E 2 by Open AI, Imagen by Google Brain and many others. We are delighted that AI media generation is a cooperative field and hope it can continue this way to bring the gift of creativity to all.*
+
+## Tips:
+
+- Stable Diffusion has the same architecture as [Latent Diffusion](https://arxiv.org/abs/2112.10752) but uses a frozen CLIP Text Encoder instead of training the text encoder jointly with the diffusion model.
+- An in-detail explanation of the Stable Diffusion model can be found under [Stable Diffusion with 🧨 Diffusers](https://huggingface.co/blog/stable_diffusion).
+- If you don't want to rely on the Hugging Face Hub and have to pass an authentication token, you can
+download the weights with `git lfs install; git clone https://huggingface.co/runwayml/stable-diffusion-v1-5` and instead pass the local path to the cloned folder to `from_pretrained` as shown below.
+- Stable Diffusion can work with a variety of different samplers as is shown below.
+
+## Available Pipelines:
+
+| Pipeline | Tasks | Colab |
+|---|---|:---:|
+| [pipeline_stable_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py) | *Text-to-Image Generation* | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [pipeline_stable_diffusion_img2img](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) | *Image-to-Image Text-Guided Generation* | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+| [pipeline_stable_diffusion_inpaint](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | *Text-Guided Image Inpainting* | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+
+## Examples:
+
+### Using Stable Diffusion without being logged into the Hub.
+
+If you want to download the model weights using a single Python line, you need to be logged in via `huggingface-cli login`.
+
+```python
+from diffusers import DiffusionPipeline
+
+pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+```
+
+This, however, can make it difficult to build applications on top of `diffusers`, as you will always have to pass the token around. A potential way to solve this issue is by downloading the weights to a local path `"./stable-diffusion-v1-5"`:
+
+```
+git lfs install
+git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
+```
+
+and simply passing the local path to `from_pretrained`:
+
+```python
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained("./stable-diffusion-v1-5")
+```
+
+### Text-to-Image with default PLMS scheduler
+
+```python
+# make sure you're logged in with `huggingface-cli login`
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt).images[0]
+
+image.save("astronaut_rides_horse.png")
+```
+
+### Text-to-Image with DDIM scheduler
+
+```python
+# make sure you're logged in with `huggingface-cli login`
+from diffusers import StableDiffusionPipeline, DDIMScheduler
+
+scheduler = DDIMScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
+
+pipe = StableDiffusionPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5",
+ scheduler=scheduler,
+).to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt).images[0]
+
+image.save("astronaut_rides_horse.png")
+```
+
+### Text-to-Image with K-LMS scheduler
+
+```python
+# make sure you're logged in with `huggingface-cli login`
+from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
+
+lms = LMSDiscreteScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
+
+pipe = StableDiffusionPipeline.from_pretrained(
+ "runwayml/stable-diffusion-v1-5",
+ scheduler=lms,
+).to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt).images[0]
+
+image.save("astronaut_rides_horse.png")
+```
+
+### CycleDiffusion using Stable Diffusion and DDIM scheduler
+
+```python
+import requests
+import torch
+from PIL import Image
+from io import BytesIO
+
+from diffusers import CycleDiffusionPipeline, DDIMScheduler
+
+
+# load the scheduler. CycleDiffusion only supports stochastic schedulers.
+
+# load the pipeline
+# make sure you're logged in with `huggingface-cli login`
+model_id_or_path = "CompVis/stable-diffusion-v1-4"
+scheduler = DDIMScheduler.from_config(model_id_or_path, subfolder="scheduler")
+pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path, scheduler=scheduler).to("cuda")
+
+# let's download an initial image
+url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/An%20astronaut%20riding%20a%20horse.png"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((512, 512))
+init_image.save("horse.png")
+
+# let's specify a prompt
+source_prompt = "An astronaut riding a horse"
+prompt = "An astronaut riding an elephant"
+
+# call the pipeline
+image = pipe(
+ prompt=prompt,
+ source_prompt=source_prompt,
+ init_image=init_image,
+ num_inference_steps=100,
+ eta=0.1,
+ strength=0.8,
+ guidance_scale=2,
+ source_guidance_scale=1,
+).images[0]
+
+image.save("horse_to_elephant.png")
+
+# let's try another example
+# See more samples at the original repo: https://github.com/ChenWu98/cycle-diffusion
+url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/A%20black%20colored%20car.png"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((512, 512))
+init_image.save("black.png")
+
+source_prompt = "A black colored car"
+prompt = "A blue colored car"
+
+# call the pipeline
+torch.manual_seed(0)
+image = pipe(
+ prompt=prompt,
+ source_prompt=source_prompt,
+ init_image=init_image,
+ num_inference_steps=100,
+ eta=0.1,
+ strength=0.85,
+ guidance_scale=3,
+ source_guidance_scale=1,
+).images[0]
+
+image.save("black_to_blue.png")
+```
diff --git a/src/model/TextGen/diffusers/pipelines/stable_diffusion/__init__.py b/src/model/TextGen/diffusers/pipelines/stable_diffusion/__init__.py
new file mode 100644
index 0000000..6623929
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/stable_diffusion/__init__.py
@@ -0,0 +1,65 @@
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import numpy as np
+
+import PIL
+from PIL import Image
+
+from ...utils import BaseOutput, is_flax_available, is_onnx_available, is_torch_available, is_transformers_available
+
+
+@dataclass
+class StableDiffusionPipelineOutput(BaseOutput):
+ """
+ Output class for Stable Diffusion pipelines.
+
+ Args:
+ images (`List[PIL.Image.Image]` or `np.ndarray`)
+ List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy arrays represent the denoised images of the diffusion pipeline.
+ nsfw_content_detected (`List[bool]`)
+ List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
+ (nsfw) content, or `None` if safety checking could not be performed.
+ """
+
+ images: Union[List[PIL.Image.Image], np.ndarray]
+ nsfw_content_detected: Optional[List[bool]]
+
+
+if is_transformers_available() and is_torch_available():
+ from .pipeline_cycle_diffusion import CycleDiffusionPipeline
+ from .pipeline_stable_diffusion import StableDiffusionPipeline
+ from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
+ from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
+ from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy
+ from .safety_checker import StableDiffusionSafetyChecker
+
+if is_transformers_available() and is_onnx_available():
+ from .pipeline_onnx_stable_diffusion import OnnxStableDiffusionPipeline, StableDiffusionOnnxPipeline
+ from .pipeline_onnx_stable_diffusion_img2img import OnnxStableDiffusionImg2ImgPipeline
+ from .pipeline_onnx_stable_diffusion_inpaint import OnnxStableDiffusionInpaintPipeline
+
+if is_transformers_available() and is_flax_available():
+ import flax
+
+ @flax.struct.dataclass
+ class FlaxStableDiffusionPipelineOutput(BaseOutput):
+ """
+ Output class for Stable Diffusion pipelines.
+
+ Args:
+ images (`List[PIL.Image.Image]` or `np.ndarray`)
+ List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+                num_channels)`. PIL images or numpy arrays represent the denoised images of the diffusion pipeline.
+ nsfw_content_detected (`List[bool]`)
+ List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
+ (nsfw) content.
+ """
+
+ images: Union[List[PIL.Image.Image], np.ndarray]
+ nsfw_content_detected: List[bool]
+
+ from ...schedulers.scheduling_pndm_flax import PNDMSchedulerState
+ from .pipeline_flax_stable_diffusion import FlaxStableDiffusionPipeline
+ from .safety_checker_flax import FlaxStableDiffusionSafetyChecker
diff --git a/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
new file mode 100644
index 0000000..50f519c
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -0,0 +1,575 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+import PIL
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import DDIMScheduler
+from ...utils import deprecate, logging
+from . import StableDiffusionPipelineOutput
+from .safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
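+# snap both sides down to multiples of 32 (so the VAE latents divide evenly through the
+# UNet's downsampling stages) and rescale pixels from [0, 255] to [-1, 1]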
+def preprocess(image):
+ w, h = image.size
+ w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
+ image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+ image = np.array(image).astype(np.float32) / 255.0
+ image = image[None].transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image)
+ return 2.0 * image - 1.0
+
+
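+# `posterior_sample` draws x_{t-1} from the DDIM posterior around the *clean* source
+# latents: it recovers the implied noise e_t from (latents, clean_latents), then takes
+# one reverse step with stochasticity `eta`, following eq. (12) of
+# https://arxiv.org/pdf/2010.02502.pdf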
+def posterior_sample(scheduler, latents, timestep, clean_latents, generator, eta):
+ # 1. get previous step value (=t-1)
+ prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps
+
+ if prev_timestep <= 0:
+ return clean_latents
+
+ # 2. compute alphas, betas
+ alpha_prod_t = scheduler.alphas_cumprod[timestep]
+ alpha_prod_t_prev = (
+ scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod
+ )
+
+ variance = scheduler._get_variance(timestep, prev_timestep)
+ std_dev_t = eta * variance ** (0.5)
+
+ # direction pointing to x_t
+ e_t = (latents - alpha_prod_t ** (0.5) * clean_latents) / (1 - alpha_prod_t) ** (0.5)
+ dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * e_t
+ noise = std_dev_t * torch.randn(
+ clean_latents.shape, dtype=clean_latents.dtype, device=clean_latents.device, generator=generator
+ )
+ prev_latents = alpha_prod_t_prev ** (0.5) * clean_latents + dir_xt + noise
+
+ return prev_latents
+
+
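+# `compute_noise` inverts a DDIM step: given a (prev_latents, latents) pair and the
+# model's noise prediction, it solves eq. (12) of https://arxiv.org/pdf/2010.02502.pdf
+# for the variance noise that maps latents -> prev_latents; the denoising loop below
+# reuses this recovered source-branch noise when stepping the target latents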
+def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta):
+ # 1. get previous step value (=t-1)
+ prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps
+
+ # 2. compute alphas, betas
+ alpha_prod_t = scheduler.alphas_cumprod[timestep]
+ alpha_prod_t_prev = (
+ scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod
+ )
+
+ beta_prod_t = 1 - alpha_prod_t
+
+ # 3. compute predicted original sample from predicted noise also called
+ # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+ pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5)
+
+ # 4. Clip "predicted x_0"
+ if scheduler.config.clip_sample:
+ pred_original_sample = torch.clamp(pred_original_sample, -1, 1)
+
+ # 5. compute variance: "sigma_t(η)" -> see formula (16)
+ # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
+ variance = scheduler._get_variance(timestep, prev_timestep)
+ std_dev_t = eta * variance ** (0.5)
+
+ # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+ pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred
+
+ noise = (prev_latents - (alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction)) / (
+ variance ** (0.5) * eta
+ )
+ return noise
+
+
+class CycleDiffusionPipeline(DiffusionPipeline):
+ r"""
+ Pipeline for text-guided image to image generation using Stable Diffusion.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`CLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ safety_checker ([`StableDiffusionSafetyChecker`]):
+ Classification module that estimates whether generated images could be considered offensive or harmful.
+ Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+ feature_extractor ([`CLIPFeatureExtractor`]):
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+ """
+
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: DDIMScheduler,
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ ):
+ super().__init__()
+
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly, as leaving `steps_offset` unchanged might lead to incorrect results"
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["steps_offset"] = 1
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend keeping the safety filter enabled in all public-facing circumstances, disabling"
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+ )
+
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ )
+
+ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+ r"""
+ Enable sliced attention computation.
+
+ When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+ in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+ Args:
+ slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+ When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+ a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+ `attention_head_dim` must be a multiple of `slice_size`.
+ """
+ if slice_size == "auto":
+ # half the attention head size is usually a good trade-off between
+ # speed and memory
+ slice_size = self.unet.config.attention_head_dim // 2
+ self.unet.set_attention_slice(slice_size)
+
+ def disable_attention_slicing(self):
+ r"""
+ Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+ back to computing attention in one step.
+ """
+ # set slice_size = `None` to disable `set_attention_slice`
+ self.enable_attention_slicing(None)
+
+ @property
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+ def _execution_device(self):
+ r"""
+ Returns the device on which the pipeline's models will be executed. After calling
+ `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+ hooks.
+ """
+ if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+ return self.device
+ for module in self.unet.modules():
+ if (
+ hasattr(module, "_hf_hook")
+ and hasattr(module._hf_hook, "execution_device")
+ and module._hf_hook.execution_device is not None
+ ):
+ return torch.device(module._hf_hook.execution_device)
+ return self.device
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+ prompt (`str` or `list(int)`):
+ prompt to be encoded
+ device: (`torch.device`):
+            device (`torch.device`):
+ num_images_per_prompt (`int`):
+ number of images that should be generated per prompt
+ do_classifier_free_guidance (`bool`):
+ whether to use classifier free guidance or not
+ negative_prompt (`str` or `List[str]`):
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+ if `guidance_scale` is less than `1`).
+ """
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ return_tensors="pt",
+ )
+ text_input_ids = text_inputs.input_ids
+
+ if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+ removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+ text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+ text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
+
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
+ bs_embed, seq_len, _ = text_embeddings.shape
+ text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+ text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+ # get unconditional embeddings for classifier free guidance
+ if do_classifier_free_guidance:
+ uncond_tokens: List[str]
+ if negative_prompt is None:
+ uncond_tokens = [""] * batch_size
+ elif type(prompt) is not type(negative_prompt):
+ raise TypeError(
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif isinstance(negative_prompt, str):
+ uncond_tokens = [negative_prompt]
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = negative_prompt
+
+ max_length = text_input_ids.shape[-1]
+ uncond_input = self.tokenizer(
+ uncond_tokens,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+ uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
+
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+ seq_len = uncond_embeddings.shape[1]
+ uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+ uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+ return text_embeddings
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ prompt: Union[str, List[str]],
+ source_prompt: Union[str, List[str]],
+ init_image: Union[torch.FloatTensor, PIL.Image.Image],
+ strength: float = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ source_guidance_scale: Optional[float] = 1,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.1,
+ generator: Optional[torch.Generator] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ **kwargs,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            source_prompt (`str` or `List[str]`):
+                The prompt or prompts describing the input image, used to guide the source branch of the edit.
+ init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+ `Image`, or tensor representing an image batch, that will be used as the starting point for the
+ process.
+ strength (`float`, *optional*, defaults to 0.8):
+ Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+ `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+ number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+ noise will be maximum and the denoising process will run for the full number of iterations specified in
+ `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference. This parameter will be modulated by `strength`.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ source_guidance_scale (`float`, *optional*, defaults to 1):
+                Guidance scale for the source prompt. This is useful to control how much influence the source
+                prompt has on the encoding of the source image.
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.1):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`torch.Generator`, *optional*):
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+ deterministic.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+ plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+
+ Returns:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+ (nsfw) content, according to the `safety_checker`.
+ """
+ if isinstance(prompt, str):
+ batch_size = 1
+ elif isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ if batch_size != 1:
+ raise ValueError(
+ "At the moment only `batch_size=1` is supported for prompts, but you seem to have passed multiple"
+ f" prompts: {prompt}. Please make sure to pass only a single prompt."
+ )
+
+ if strength < 0 or strength > 1:
+ raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+ if (callback_steps is None) or (
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
+ raise ValueError(
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+ f" {type(callback_steps)}."
+ )
+
+ # set timesteps
+ self.scheduler.set_timesteps(num_inference_steps)
+
+ if isinstance(init_image, PIL.Image.Image):
+ init_image = preprocess(init_image)
+
+ device = self._execution_device
+
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ text_embeddings = self._encode_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance, None)
+ source_text_embeddings = self._encode_prompt(
+ source_prompt, device, num_images_per_prompt, do_classifier_free_guidance, None
+ )
+
+ # encode the init image into latents and scale the latents
+ latents_dtype = text_embeddings.dtype
+ init_image = init_image.to(device=self.device, dtype=latents_dtype)
+ init_latent_dist = self.vae.encode(init_image).latent_dist
+ init_latents = init_latent_dist.sample(generator=generator)
+ init_latents = 0.18215 * init_latents
+
+ if isinstance(prompt, str):
+ prompt = [prompt]
+ if len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] == 0:
+ # expand init_latents for batch_size
+ deprecation_message = (
+ f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+            f" images (`init_image`). Initial images are now being duplicated to match the number of text prompts. Note"
+ " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+ " your script to pass as many init images as text prompts to suppress this warning."
+ )
+ deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
+ additional_image_per_prompt = len(prompt) // init_latents.shape[0]
+ init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
+ elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0:
+ raise ValueError(
+ f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts."
+ )
+ else:
+ init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
+
+ # get the original timestep using init_timestep
+ offset = self.scheduler.config.get("steps_offset", 0)
+ init_timestep = int(num_inference_steps * strength) + offset
+ init_timestep = min(init_timestep, num_inference_steps)
+
+ timesteps = self.scheduler.timesteps[-init_timestep]
+ timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt, device=self.device)
+
+ # add noise to latents using the timesteps
+ noise = torch.randn(init_latents.shape, generator=generator, device=self.device, dtype=latents_dtype)
+ clean_latents = init_latents
+ init_latents = self.scheduler.add_noise(init_latents, noise, timesteps)
+
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+
+ if not (accepts_eta and (0 < eta <= 1)):
+ raise ValueError(
+                "Currently, only the DDIM scheduler with `0 < eta <= 1` is supported. Please make sure that"
+                f" `pipeline.scheduler` is of type {DDIMScheduler} and not {self.scheduler.__class__}."
+ )
+
+ extra_step_kwargs["eta"] = eta
+
+ latents = init_latents
+ source_latents = init_latents
+
+ t_start = max(num_inference_steps - init_timestep + offset, 0)
+
+ # Some schedulers like PNDM have timesteps as arrays
+ # It's more optimized to move all timesteps to correct device beforehand
+ timesteps = self.scheduler.timesteps[t_start:].to(self.device)
+
+ for i, t in enumerate(self.progress_bar(timesteps)):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat([latents] * 2)
+ source_latent_model_input = torch.cat([source_latents] * 2)
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+ source_latent_model_input = self.scheduler.scale_model_input(source_latent_model_input, t)
+
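+            # run all four branches through one UNet forward pass, batched in the fixed
+            # order [source-uncond, target-uncond, source-cond, target-cond];
+            # the chunk(4) below relies on this ordering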
+ # predict the noise residual
+ concat_latent_model_input = torch.stack(
+ [
+ source_latent_model_input[0],
+ latent_model_input[0],
+ source_latent_model_input[1],
+ latent_model_input[1],
+ ],
+ dim=0,
+ )
+ concat_text_embeddings = torch.stack(
+ [
+ source_text_embeddings[0],
+ text_embeddings[0],
+ source_text_embeddings[1],
+ text_embeddings[1],
+ ],
+ dim=0,
+ )
+ concat_noise_pred = self.unet(
+ concat_latent_model_input, t, encoder_hidden_states=concat_text_embeddings
+ ).sample
+
+ # perform guidance
+ (
+ source_noise_pred_uncond,
+ noise_pred_uncond,
+ source_noise_pred_text,
+ noise_pred_text,
+ ) = concat_noise_pred.chunk(4, dim=0)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+ source_noise_pred = source_noise_pred_uncond + source_guidance_scale * (
+ source_noise_pred_text - source_noise_pred_uncond
+ )
+
+ # Sample source_latents from the posterior distribution.
+ prev_source_latents = posterior_sample(
+ self.scheduler, source_latents, t, clean_latents, generator=generator, **extra_step_kwargs
+ )
+ # Compute noise.
+ noise = compute_noise(
+ self.scheduler, prev_source_latents, source_latents, t, source_noise_pred, **extra_step_kwargs
+ )
+ source_latents = prev_source_latents
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(
+ noise_pred, t, latents, variance_noise=noise, **extra_step_kwargs
+ ).prev_sample
+
+ # call the callback, if provided
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, latents)
+
+ latents = 1 / 0.18215 * latents
+ image = self.vae.decode(latents).sample
+
+ image = (image / 2 + 0.5).clamp(0, 1)
+ image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+ if self.safety_checker is not None:
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
+ self.device
+ )
+ image, has_nsfw_concept = self.safety_checker(
+ images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+ )
+ else:
+ has_nsfw_concept = None
+
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ if not return_dict:
+ return (image, has_nsfw_concept)
+
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
new file mode 100644
index 0000000..0294399
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
@@ -0,0 +1,352 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from functools import partial
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict
+from flax.jax_utils import unreplicate
+from flax.training.common_utils import shard
+from PIL import Image
+from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel
+
+from ...models import FlaxAutoencoderKL, FlaxUNet2DConditionModel
+from ...pipeline_flax_utils import FlaxDiffusionPipeline
+from ...schedulers import (
+ FlaxDDIMScheduler,
+ FlaxDPMSolverMultistepScheduler,
+ FlaxLMSDiscreteScheduler,
+ FlaxPNDMScheduler,
+)
+from ...utils import logging
+from . import FlaxStableDiffusionPipelineOutput
+from .safety_checker_flax import FlaxStableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+class FlaxStableDiffusionPipeline(FlaxDiffusionPipeline):
+ r"""
+ Pipeline for text-to-image generation using Stable Diffusion.
+
+ This model inherits from [`FlaxDiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Args:
+ vae ([`FlaxAutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`FlaxCLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.FlaxCLIPTextModel),
+ specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`FlaxUNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], [`FlaxPNDMScheduler`], or
+ [`FlaxDPMSolverMultistepScheduler`].
+ safety_checker ([`FlaxStableDiffusionSafetyChecker`]):
+ Classification module that estimates whether generated images could be considered offensive or harmful.
+ Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+ feature_extractor ([`CLIPFeatureExtractor`]):
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+ """
+
+ def __init__(
+ self,
+ vae: FlaxAutoencoderKL,
+ text_encoder: FlaxCLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: FlaxUNet2DConditionModel,
+ scheduler: Union[
+ FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler, FlaxDPMSolverMultistepScheduler
+ ],
+ safety_checker: FlaxStableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ dtype: jnp.dtype = jnp.float32,
+ ):
+ super().__init__()
+ self.dtype = dtype
+
+ if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend keeping the safety filter enabled in all public-facing circumstances, disabling"
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+ )
+
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ )
+
+ def prepare_inputs(self, prompt: Union[str, List[str]]):
+ if not isinstance(prompt, (str, list)):
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ text_input = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ truncation=True,
+ return_tensors="np",
+ )
+ return text_input.input_ids
+
+ def _get_has_nsfw_concepts(self, features, params):
+ has_nsfw_concepts = self.safety_checker(features, params)
+ return has_nsfw_concepts
+
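+    # under `jit`, the features are sharded across devices and scored with the pmapped
+    # checker; the per-device outputs are then unsharded back into a flat batch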
+ def _run_safety_checker(self, images, safety_model_params, jit=False):
+ # safety_model_params should already be replicated when jit is True
+ pil_images = [Image.fromarray(image) for image in images]
+ features = self.feature_extractor(pil_images, return_tensors="np").pixel_values
+
+ if jit:
+ features = shard(features)
+ has_nsfw_concepts = _p_get_has_nsfw_concepts(self, features, safety_model_params)
+ has_nsfw_concepts = unshard(has_nsfw_concepts)
+ safety_model_params = unreplicate(safety_model_params)
+ else:
+ has_nsfw_concepts = self._get_has_nsfw_concepts(features, safety_model_params)
+
+ images_was_copied = False
+ for idx, has_nsfw_concept in enumerate(has_nsfw_concepts):
+ if has_nsfw_concept:
+ if not images_was_copied:
+ images_was_copied = True
+ images = images.copy()
+
+ images[idx] = np.zeros(images[idx].shape, dtype=np.uint8) # black image
+
+ if any(has_nsfw_concepts):
+ warnings.warn(
+ "Potential NSFW content was detected in one or more images. A black image will be returned"
+ " instead. Try again with a different prompt and/or seed."
+ )
+
+ return images, has_nsfw_concepts
+
+ def _generate(
+ self,
+ prompt_ids: jnp.array,
+ params: Union[Dict, FrozenDict],
+ prng_seed: jax.random.PRNGKey,
+ num_inference_steps: int = 50,
+ height: int = 512,
+ width: int = 512,
+ guidance_scale: float = 7.5,
+ latents: Optional[jnp.array] = None,
+ debug: bool = False,
+ ):
+ if height % 8 != 0 or width % 8 != 0:
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+ # get prompt text embeddings
+ text_embeddings = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]
+
+        # TODO: classifier-free guidance is currently always applied; implement the conditional
+        # `do_classifier_free_guidance = guidance_scale > 1.0` and skip the unconditional pass when it is False
+ batch_size = prompt_ids.shape[0]
+
+ max_length = prompt_ids.shape[-1]
+ uncond_input = self.tokenizer(
+ [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np"
+ )
+ uncond_embeddings = self.text_encoder(uncond_input.input_ids, params=params["text_encoder"])[0]
+ context = jnp.concatenate([uncond_embeddings, text_embeddings])
+
+ latents_shape = (batch_size, self.unet.in_channels, height // 8, width // 8)
+ if latents is None:
+ latents = jax.random.normal(prng_seed, shape=latents_shape, dtype=jnp.float32)
+ else:
+ if latents.shape != latents_shape:
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+
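+        # the denoising loop is written as a `loop_body(step, carry)` function so it can
+        # run under `jax.lax.fori_loop` (jit-compatible); `debug=True` below runs the same
+        # body as a plain Python loop instead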
+ def loop_body(step, args):
+ latents, scheduler_state = args
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ latents_input = jnp.concatenate([latents] * 2)
+
+ t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
+ timestep = jnp.broadcast_to(t, latents_input.shape[0])
+
+ latents_input = self.scheduler.scale_model_input(scheduler_state, latents_input, t)
+
+ # predict the noise residual
+ noise_pred = self.unet.apply(
+ {"params": params["unet"]},
+ jnp.array(latents_input),
+ jnp.array(timestep, dtype=jnp.int32),
+ encoder_hidden_states=context,
+ ).sample
+ # perform guidance
+ noise_pred_uncond, noise_prediction_text = jnp.split(noise_pred, 2, axis=0)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents, scheduler_state = self.scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
+ return latents, scheduler_state
+
+ scheduler_state = self.scheduler.set_timesteps(
+ params["scheduler"], num_inference_steps=num_inference_steps, shape=latents.shape
+ )
+
+ # scale the initial noise by the standard deviation required by the scheduler
+ latents = latents * self.scheduler.init_noise_sigma
+
+ if debug:
+ # run with python for loop
+ for i in range(num_inference_steps):
+ latents, scheduler_state = loop_body(i, (latents, scheduler_state))
+ else:
+ latents, _ = jax.lax.fori_loop(0, num_inference_steps, loop_body, (latents, scheduler_state))
+
+ # scale and decode the image latents with vae
+ latents = 1 / 0.18215 * latents
+ image = self.vae.apply({"params": params["vae"]}, latents, method=self.vae.decode).sample
+
+ image = (image / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1)
+ return image
+
+ def __call__(
+ self,
+ prompt_ids: jnp.array,
+ params: Union[Dict, FrozenDict],
+ prng_seed: jax.random.PRNGKey,
+ num_inference_steps: int = 50,
+ height: int = 512,
+ width: int = 512,
+ guidance_scale: float = 7.5,
+        latents: Optional[jnp.array] = None,
+ return_dict: bool = True,
+ jit: bool = False,
+ debug: bool = False,
+ **kwargs,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+            prompt_ids (`jnp.array`):
+                Tokenized prompts to guide the image generation, as produced by `prepare_inputs`.
+ height (`int`, *optional*, defaults to 512):
+ The height in pixels of the generated image.
+ width (`int`, *optional*, defaults to 512):
+ The width in pixels of the generated image.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+            prng_seed (`jax.random.PRNGKey`):
+                A JAX PRNG key used as the random seed, to make generation deterministic.
+            latents (`jnp.array`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a
+                latents tensor will be generated by sampling with the supplied `prng_seed`.
+            debug (`bool`, defaults to `False`):
+                If `True`, run the denoising loop as a plain Python loop instead of `jax.lax.fori_loop`, which is
+                easier to step through when debugging.
+ jit (`bool`, defaults to `False`):
+ Whether to run `pmap` versions of the generation and safety scoring functions. NOTE: This argument
+ exists because `__call__` is not yet end-to-end pmap-able. It will be removed in a future release.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] instead of
+ a plain tuple.
+
+ Returns:
+ [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] or `tuple`:
+ [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images, and the second
+ element is a list of `bool`s denoting whether the corresponding generated image likely represents
+ "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
+ """
+ if jit:
+ images = _p_generate(
+ self, prompt_ids, params, prng_seed, num_inference_steps, height, width, guidance_scale, latents, debug
+ )
+ else:
+ images = self._generate(
+ prompt_ids, params, prng_seed, num_inference_steps, height, width, guidance_scale, latents, debug
+ )
+
+ if self.safety_checker is not None:
+ safety_params = params["safety_checker"]
+ images_uint8_casted = (images * 255).round().astype("uint8")
+ num_devices, batch_size = images.shape[:2]
+
+ images_uint8_casted = np.asarray(images_uint8_casted).reshape(num_devices * batch_size, height, width, 3)
+ images_uint8_casted, has_nsfw_concept = self._run_safety_checker(images_uint8_casted, safety_params, jit)
+            images = np.asarray(images).reshape(num_devices * batch_size, height, width, 3)
+
+            # black out any image flagged as NSFW
+ if any(has_nsfw_concept):
+ for i, is_nsfw in enumerate(has_nsfw_concept):
+ if is_nsfw:
+ images[i] = np.asarray(images_uint8_casted[i])
+
+ images = images.reshape(num_devices, batch_size, height, width, 3)
+ else:
+ has_nsfw_concept = False
+
+ if not return_dict:
+ return (images, has_nsfw_concept)
+
+ return FlaxStableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept)
+
+
+# TODO: maybe use a config dict instead of so many static argnums
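+# argnums 0 (pipe) and 4-7, 9 are Python scalars treated as compile-time constants;
+# calling with different values triggers a recompilation of the pmapped function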
+@partial(jax.pmap, static_broadcasted_argnums=(0, 4, 5, 6, 7, 9))
+def _p_generate(
+ pipe, prompt_ids, params, prng_seed, num_inference_steps, height, width, guidance_scale, latents, debug
+):
+ return pipe._generate(
+ prompt_ids, params, prng_seed, num_inference_steps, height, width, guidance_scale, latents, debug
+ )
+
+
+@partial(jax.pmap, static_broadcasted_argnums=(0,))
+def _p_get_has_nsfw_concepts(pipe, features, params):
+ return pipe._get_has_nsfw_concepts(features, params)
+
+
+def unshard(x: jnp.ndarray):
+ # einops.rearrange(x, 'd b ... -> (d b) ...')
+ num_devices, batch_size = x.shape[:2]
+ rest = x.shape[2:]
+ return x.reshape(num_devices * batch_size, *rest)
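+
+
+# Minimal usage sketch (illustrative; the checkpoint id, revision and dtype are
+# assumptions, not pinned by this file):
+#   pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
+#       "runwayml/stable-diffusion-v1-5", revision="flax", dtype=jnp.bfloat16
+#   )
+#   prompt_ids = pipeline.prepare_inputs(["a photo of an astronaut riding a horse"])
+#   images = pipeline(prompt_ids, params, jax.random.PRNGKey(0)).images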
diff --git a/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
new file mode 100644
index 0000000..d1e2704
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -0,0 +1,330 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+from transformers import CLIPFeatureExtractor, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import deprecate, logging
+from . import StableDiffusionPipelineOutput
+
+
+logger = logging.get_logger(__name__)
+
+
+class OnnxStableDiffusionPipeline(DiffusionPipeline):
+ vae_encoder: OnnxRuntimeModel
+ vae_decoder: OnnxRuntimeModel
+ text_encoder: OnnxRuntimeModel
+ tokenizer: CLIPTokenizer
+ unet: OnnxRuntimeModel
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]
+ safety_checker: OnnxRuntimeModel
+ feature_extractor: CLIPFeatureExtractor
+
+ def __init__(
+ self,
+ vae_encoder: OnnxRuntimeModel,
+ vae_decoder: OnnxRuntimeModel,
+ text_encoder: OnnxRuntimeModel,
+ tokenizer: CLIPTokenizer,
+ unet: OnnxRuntimeModel,
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+ safety_checker: OnnxRuntimeModel,
+ feature_extractor: CLIPFeatureExtractor,
+ ):
+ super().__init__()
+
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly, as leaving `steps_offset` unchanged might lead to incorrect results"
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["steps_offset"] = 1
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+ deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} sets `clip_sample` to `True`."
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+ )
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["clip_sample"] = False
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ self.register_modules(
+ vae_encoder=vae_encoder,
+ vae_decoder=vae_decoder,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ )
+
+ def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+ prompt (`str` or `list(int)`):
+ prompt to be encoded
+ num_images_per_prompt (`int`):
+ number of images that should be generated per prompt
+ do_classifier_free_guidance (`bool`):
+ whether to use classifier free guidance or not
+ negative_prompt (`str` or `List[str]`):
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+ if `guidance_scale` is less than `1`).
+ """
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+ # get prompt text embeddings
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ return_tensors="np",
+ )
+ text_input_ids = text_inputs.input_ids
+
+ if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+ removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+ text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+ text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+ text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)
+
+ # get unconditional embeddings for classifier free guidance
+ if do_classifier_free_guidance:
+ uncond_tokens: List[str]
+ if negative_prompt is None:
+ uncond_tokens = [""] * batch_size
+ elif type(prompt) is not type(negative_prompt):
+ raise TypeError(
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif isinstance(negative_prompt, str):
+ uncond_tokens = [negative_prompt] * batch_size
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = negative_prompt
+
+ max_length = text_input_ids.shape[-1]
+ uncond_input = self.tokenizer(
+ uncond_tokens,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_tensors="np",
+ )
+ uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
+ uncond_embeddings = np.repeat(uncond_embeddings, num_images_per_prompt, axis=0)
+
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
+
+ return text_embeddings
+
+ def __call__(
+ self,
+ prompt: Union[str, List[str]],
+ height: Optional[int] = 512,
+ width: Optional[int] = 512,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.0,
+ generator: Optional[np.random.RandomState] = None,
+ latents: Optional[np.ndarray] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+ callback_steps: Optional[int] = 1,
+ **kwargs,
+ ):
+ if isinstance(prompt, str):
+ batch_size = 1
+ elif isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ if height % 8 != 0 or width % 8 != 0:
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+ if (callback_steps is None) or (
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
+ raise ValueError(
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+ f" {type(callback_steps)}."
+ )
+
+ if generator is None:
+ generator = np.random
+
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ text_embeddings = self._encode_prompt(
+ prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+ )
+
+ # get the initial random noise unless the user supplied it
+ latents_dtype = text_embeddings.dtype
+ latents_shape = (batch_size * num_images_per_prompt, 4, height // 8, width // 8)
+ if latents is None:
+ latents = generator.randn(*latents_shape).astype(latents_dtype)
+ elif latents.shape != latents_shape:
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+
+ # set timesteps
+ self.scheduler.set_timesteps(num_inference_steps)
+
+        latents = latents * float(self.scheduler.init_noise_sigma)
+
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+
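+        # the exported UNet may declare its "timestep" input as e.g. tensor(float) or
+        # tensor(int64); read the dtype from the ONNX graph so the array below matches it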
+ timestep_dtype = next(
+ (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)"
+ )
+ timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype]
+
+ for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t)
+ latent_model_input = latent_model_input.cpu().numpy()
+
+ # predict the noise residual
+ timestep = np.array([t], dtype=timestep_dtype)
+ noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=text_embeddings)
+ noise_pred = noise_pred[0]
+
+ # perform guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, torch.from_numpy(latents), **extra_step_kwargs).prev_sample
+ latents = np.array(latents)
+
+ # call the callback, if provided
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, latents)
+
+ latents = 1 / 0.18215 * latents
+        # image = self.vae_decoder(latent_sample=latents)[0]
+        # decoding the whole batch at once can give wrong results with a half-precision
+        # VAE decoder when batch size > 1, so decode the latents one image at a time
+ image = np.concatenate(
+ [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
+ )
+
+ image = np.clip(image / 2 + 0.5, 0, 1)
+ image = image.transpose((0, 2, 3, 1))
+
+ if self.safety_checker is not None:
+ safety_checker_input = self.feature_extractor(
+ self.numpy_to_pil(image), return_tensors="np"
+ ).pixel_values.astype(image.dtype)
+
+            # the safety checker raises an error when called with batch size > 1,
+            # so score the images one at a time
+            images, has_nsfw_concept = [], []
+ for i in range(image.shape[0]):
+ image_i, has_nsfw_concept_i = self.safety_checker(
+ clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
+ )
+ images.append(image_i)
+ has_nsfw_concept.append(has_nsfw_concept_i[0])
+ image = np.concatenate(images)
+ else:
+ has_nsfw_concept = None
+
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ if not return_dict:
+ return (image, has_nsfw_concept)
+
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+
+class StableDiffusionOnnxPipeline(OnnxStableDiffusionPipeline):
+ def __init__(
+ self,
+ vae_encoder: OnnxRuntimeModel,
+ vae_decoder: OnnxRuntimeModel,
+ text_encoder: OnnxRuntimeModel,
+ tokenizer: CLIPTokenizer,
+ unet: OnnxRuntimeModel,
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+ safety_checker: OnnxRuntimeModel,
+ feature_extractor: CLIPFeatureExtractor,
+ ):
+ deprecation_message = "Please use `OnnxStableDiffusionPipeline` instead of `StableDiffusionOnnxPipeline`."
+ deprecate("StableDiffusionOnnxPipeline", "1.0.0", deprecation_message)
+ super().__init__(
+ vae_encoder=vae_encoder,
+ vae_decoder=vae_decoder,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ )
diff --git a/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
new file mode 100644
index 0000000..a09dfe7
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -0,0 +1,441 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+import PIL
+from transformers import CLIPFeatureExtractor, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import deprecate, logging
+from . import StableDiffusionPipelineOutput
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+def preprocess(image):
+ w, h = image.size
+ w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
+ image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+ image = np.array(image).astype(np.float32) / 255.0
+ image = image[None].transpose(0, 3, 1, 2)
+ return 2.0 * image - 1.0
+
+
+class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
+ r"""
+ Pipeline for text-guided image to image generation using Stable Diffusion.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`CLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ safety_checker ([`StableDiffusionSafetyChecker`]):
+ Classification module that estimates whether generated images could be considered offensive or harmful.
+ Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+ feature_extractor ([`CLIPFeatureExtractor`]):
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+ """
+ vae_encoder: OnnxRuntimeModel
+ vae_decoder: OnnxRuntimeModel
+ text_encoder: OnnxRuntimeModel
+ tokenizer: CLIPTokenizer
+ unet: OnnxRuntimeModel
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]
+ safety_checker: OnnxRuntimeModel
+ feature_extractor: CLIPFeatureExtractor
+
+ def __init__(
+ self,
+ vae_encoder: OnnxRuntimeModel,
+ vae_decoder: OnnxRuntimeModel,
+ text_encoder: OnnxRuntimeModel,
+ tokenizer: CLIPTokenizer,
+ unet: OnnxRuntimeModel,
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+ safety_checker: OnnxRuntimeModel,
+ feature_extractor: CLIPFeatureExtractor,
+ ):
+ super().__init__()
+
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might lead to incorrect results"
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["steps_offset"] = 1
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+ deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} sets `clip_sample` to True."
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+ )
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["clip_sample"] = False
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if safety_checker is None:
+ logger.warning(
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend keeping the safety filter enabled in all public-facing circumstances, disabling"
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+ )
+
+ self.register_modules(
+ vae_encoder=vae_encoder,
+ vae_decoder=vae_decoder,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ )
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline._encode_prompt
+ def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+            prompt (`str` or `List[str]`):
+ prompt to be encoded
+ num_images_per_prompt (`int`):
+ number of images that should be generated per prompt
+ do_classifier_free_guidance (`bool`):
+ whether to use classifier free guidance or not
+ negative_prompt (`str` or `List[str]`):
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+ if `guidance_scale` is less than `1`).
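+
+        With classifier-free guidance enabled, the returned array stacks the unconditional
+        embeddings before the conditional ones along the batch axis, e.g. shape
+        `(2 * batch_size * num_images_per_prompt, 77, 768)` for the default CLIP ViT-L/14
+        text encoder (dimensions given for illustration only).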
+ """
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+ # get prompt text embeddings
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ return_tensors="np",
+ )
+ text_input_ids = text_inputs.input_ids
+
+ if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+ removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+ text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+ text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+ text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)
+
+ # get unconditional embeddings for classifier free guidance
+ if do_classifier_free_guidance:
+ uncond_tokens: List[str]
+ if negative_prompt is None:
+ uncond_tokens = [""] * batch_size
+ elif type(prompt) is not type(negative_prompt):
+ raise TypeError(
+                    f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif isinstance(negative_prompt, str):
+ uncond_tokens = [negative_prompt] * batch_size
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = negative_prompt
+
+ max_length = text_input_ids.shape[-1]
+ uncond_input = self.tokenizer(
+ uncond_tokens,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_tensors="np",
+ )
+ uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
+ uncond_embeddings = np.repeat(uncond_embeddings, num_images_per_prompt, axis=0)
+
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
+
+ return text_embeddings
+
+ def __call__(
+ self,
+ prompt: Union[str, List[str]],
+ init_image: Union[np.ndarray, PIL.Image.Image],
+ strength: float = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.0,
+ generator: Optional[np.random.RandomState] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+ callback_steps: Optional[int] = 1,
+ **kwargs,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`):
+ The prompt or prompts to guide the image generation.
+ init_image (`np.ndarray` or `PIL.Image.Image`):
+ `Image`, or tensor representing an image batch, that will be used as the starting point for the
+ process.
+ strength (`float`, *optional*, defaults to 0.8):
+ Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+            `init_image` is used as a starting point, and more noise is added to it the larger the `strength`. The
+ number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+ noise will be maximum and the denoising process will run for the full number of iterations specified in
+ `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference. This parameter will be modulated by `strength`.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages the model to generate images closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+ if `guidance_scale` is less than `1`).
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`np.random.RandomState`, *optional*):
+ A np.random.RandomState to make generation deterministic.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+ plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+
+ Returns:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+ (nsfw) content, according to the `safety_checker`.
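+
+        Examples:
+            A minimal usage sketch; the checkpoint id, `revision` and `provider` below are
+            illustrative assumptions, not fixed by this repository:
+
+            ```py
+            >>> import PIL.Image
+            >>> init_image = PIL.Image.open("sketch.png").convert("RGB")
+            >>> pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(
+            ...     "runwayml/stable-diffusion-v1-5", revision="onnx", provider="CPUExecutionProvider"
+            ... )
+            >>> image = pipe(prompt="a fantasy landscape", init_image=init_image, strength=0.75).images[0]
+            ```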
+ """
+ if isinstance(prompt, str):
+ batch_size = 1
+ elif isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}")
+
+ if (callback_steps is None) or (
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
+ raise ValueError(
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+ f" {type(callback_steps)}."
+ )
+
+ if generator is None:
+ generator = np.random
+
+ # set timesteps
+ self.scheduler.set_timesteps(num_inference_steps)
+
+ if isinstance(init_image, PIL.Image.Image):
+ init_image = preprocess(init_image)
+
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ text_embeddings = self._encode_prompt(
+ prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+ )
+
+ latents_dtype = text_embeddings.dtype
+ init_image = init_image.astype(latents_dtype)
+ # encode the init image into latents and scale the latents
+ init_latents = self.vae_encoder(sample=init_image)[0]
+ init_latents = 0.18215 * init_latents
+
+ if isinstance(prompt, str):
+ prompt = [prompt]
+ if len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] == 0:
+ # expand init_latents for batch_size
+ deprecation_message = (
+ f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`init_image`). Initial images are now being duplicated to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in version 1.0.0. Please make sure to update"
+ " your script to pass as many init images as text prompts to suppress this warning."
+ )
+ deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
+ additional_image_per_prompt = len(prompt) // init_latents.shape[0]
+ init_latents = np.concatenate([init_latents] * additional_image_per_prompt * num_images_per_prompt, axis=0)
+ elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0:
+ raise ValueError(
+ f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts."
+ )
+ else:
+ init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0)
+
+ # get the original timestep using init_timestep
+ offset = self.scheduler.config.get("steps_offset", 0)
+ init_timestep = int(num_inference_steps * strength) + offset
+ init_timestep = min(init_timestep, num_inference_steps)
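+        # e.g. (illustrative numbers): num_inference_steps=50, strength=0.8 and offset=1
+        # give init_timestep=41, so noise is added at the 41st timestep from the end of
+        # the schedule and 40 denoising steps remain after t_start is computed below.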
+
+ timesteps = self.scheduler.timesteps.numpy()[-init_timestep]
+ timesteps = np.array([timesteps] * batch_size * num_images_per_prompt)
+
+ # add noise to latents using the timesteps
+ noise = generator.randn(*init_latents.shape).astype(latents_dtype)
+ init_latents = self.scheduler.add_noise(
+ torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps)
+ )
+ init_latents = init_latents.numpy()
+
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+
+ latents = init_latents
+
+ t_start = max(num_inference_steps - init_timestep + offset, 0)
+ timesteps = self.scheduler.timesteps[t_start:].numpy()
+
+ timestep_dtype = next(
+ (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)"
+ )
+ timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype]
+
+ for i, t in enumerate(self.progress_bar(timesteps)):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t)
+ latent_model_input = latent_model_input.cpu().numpy()
+
+ # predict the noise residual
+ timestep = np.array([t], dtype=timestep_dtype)
+ noise_pred = self.unet(
+ sample=latent_model_input, timestep=timestep, encoder_hidden_states=text_embeddings
+ )[0]
+
+ # perform guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, torch.from_numpy(latents), **extra_step_kwargs).prev_sample
+ latents = latents.numpy()
+
+ # call the callback, if provided
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, latents)
+
+ latents = 1 / 0.18215 * latents
+ # image = self.vae_decoder(latent_sample=latents)[0]
+        # decode sample-by-sample: the half-precision VAE decoder gives strange results when batch size > 1
+ image = np.concatenate(
+ [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
+ )
+
+ image = np.clip(image / 2 + 0.5, 0, 1)
+ image = image.transpose((0, 2, 3, 1))
+
+ if self.safety_checker is not None:
+ safety_checker_input = self.feature_extractor(
+ self.numpy_to_pil(image), return_tensors="np"
+ ).pixel_values.astype(image.dtype)
+ # safety_checker does not support batched inputs yet
+ images, has_nsfw_concept = [], []
+ for i in range(image.shape[0]):
+ image_i, has_nsfw_concept_i = self.safety_checker(
+ clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
+ )
+ images.append(image_i)
+ has_nsfw_concept.append(has_nsfw_concept_i[0])
+ image = np.concatenate(images)
+ else:
+ has_nsfw_concept = None
+
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ if not return_dict:
+ return (image, has_nsfw_concept)
+
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
new file mode 100644
index 0000000..6c226dd
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -0,0 +1,464 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+import PIL
+from transformers import CLIPFeatureExtractor, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import deprecate, logging
+from . import StableDiffusionPipelineOutput
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+NUM_UNET_INPUT_CHANNELS = 9
+NUM_LATENT_CHANNELS = 4
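+# The Stable Diffusion inpainting UNet takes 4 noisy-latent channels, 1 mask channel and
+# 4 masked-image latent channels concatenated together, hence 9 expected input channels.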
+
+
+def prepare_mask_and_masked_image(image, mask, latents_shape):
+ image = np.array(image.convert("RGB").resize((latents_shape[1] * 8, latents_shape[0] * 8)))
+ image = image[None].transpose(0, 3, 1, 2)
+ image = image.astype(np.float32) / 127.5 - 1.0
+
+ image_mask = np.array(mask.convert("L").resize((latents_shape[1] * 8, latents_shape[0] * 8)))
+ masked_image = image * (image_mask < 127.5)
+
+ mask = mask.resize((latents_shape[1], latents_shape[0]), PIL.Image.NEAREST)
+ mask = np.array(mask.convert("L"))
+ mask = mask.astype(np.float32) / 255.0
+ mask = mask[None, None]
+ mask[mask < 0.5] = 0
+ mask[mask >= 0.5] = 1
+
+ return mask, masked_image
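+# Illustrative note: with 64x64 latents the image and mask are resized to 512x512; mask
+# pixels >= 127.5 (white) become 1 and mark regions to repaint, while darker pixels
+# become 0 and are preserved, matching the pipeline docstring below.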
+
+
+class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline):
+ r"""
+ Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`CLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ safety_checker ([`StableDiffusionSafetyChecker`]):
+ Classification module that estimates whether generated images could be considered offensive or harmful.
+ Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+ feature_extractor ([`CLIPFeatureExtractor`]):
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+ """
+ vae_encoder: OnnxRuntimeModel
+ vae_decoder: OnnxRuntimeModel
+ text_encoder: OnnxRuntimeModel
+ tokenizer: CLIPTokenizer
+ unet: OnnxRuntimeModel
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]
+ safety_checker: OnnxRuntimeModel
+ feature_extractor: CLIPFeatureExtractor
+
+ def __init__(
+ self,
+ vae_encoder: OnnxRuntimeModel,
+ vae_decoder: OnnxRuntimeModel,
+ text_encoder: OnnxRuntimeModel,
+ tokenizer: CLIPTokenizer,
+ unet: OnnxRuntimeModel,
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+ safety_checker: OnnxRuntimeModel,
+ feature_extractor: CLIPFeatureExtractor,
+ ):
+ super().__init__()
+ logger.info("`OnnxStableDiffusionInpaintPipeline` is experimental and will very likely change in the future.")
+
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might lead to incorrect results"
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["steps_offset"] = 1
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+ deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} sets `clip_sample` to True."
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+ )
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["clip_sample"] = False
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if safety_checker is None:
+ logger.warning(
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend keeping the safety filter enabled in all public-facing circumstances, disabling"
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+ )
+
+ self.register_modules(
+ vae_encoder=vae_encoder,
+ vae_decoder=vae_decoder,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ )
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline._encode_prompt
+ def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+            prompt (`str` or `List[str]`):
+ prompt to be encoded
+ num_images_per_prompt (`int`):
+ number of images that should be generated per prompt
+ do_classifier_free_guidance (`bool`):
+ whether to use classifier free guidance or not
+ negative_prompt (`str` or `List[str]`):
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+ if `guidance_scale` is less than `1`).
+ """
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+ # get prompt text embeddings
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ return_tensors="np",
+ )
+ text_input_ids = text_inputs.input_ids
+
+ if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+ removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+ text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+ text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+ text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)
+
+ # get unconditional embeddings for classifier free guidance
+ if do_classifier_free_guidance:
+ uncond_tokens: List[str]
+ if negative_prompt is None:
+ uncond_tokens = [""] * batch_size
+ elif type(prompt) is not type(negative_prompt):
+ raise TypeError(
+                    f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif isinstance(negative_prompt, str):
+ uncond_tokens = [negative_prompt] * batch_size
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = negative_prompt
+
+ max_length = text_input_ids.shape[-1]
+ uncond_input = self.tokenizer(
+ uncond_tokens,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_tensors="np",
+ )
+ uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
+ uncond_embeddings = np.repeat(uncond_embeddings, num_images_per_prompt, axis=0)
+
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
+
+ return text_embeddings
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ prompt: Union[str, List[str]],
+ image: PIL.Image.Image,
+ mask_image: PIL.Image.Image,
+ height: int = 512,
+ width: int = 512,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[np.random.RandomState] = None,
+ latents: Optional[np.ndarray] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+ callback_steps: Optional[int] = 1,
+ **kwargs,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`):
+ The prompt or prompts to guide the image generation.
+ image (`PIL.Image.Image`):
+ `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+ be masked out with `mask_image` and repainted according to `prompt`.
+ mask_image (`PIL.Image.Image`):
+ `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+ repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+ to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+ instead of 3, so the expected shape would be `(B, H, W, 1)`.
+ height (`int`, *optional*, defaults to 512):
+ The height in pixels of the generated image.
+ width (`int`, *optional*, defaults to 512):
+ The width in pixels of the generated image.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages the model to generate images closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+ if `guidance_scale` is less than `1`).
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`np.random.RandomState`, *optional*):
+ A np.random.RandomState to make generation deterministic.
+ latents (`np.ndarray`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+ plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+
+ Returns:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+ (nsfw) content, according to the `safety_checker`.
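+
+        Examples:
+            A minimal usage sketch; the checkpoint id, `revision` and `provider` are
+            illustrative assumptions (an inpainting checkpoint with a 9-channel UNet is required):
+
+            ```py
+            >>> import PIL.Image
+            >>> image = PIL.Image.open("photo.png").convert("RGB")
+            >>> mask_image = PIL.Image.open("mask.png").convert("L")
+            >>> pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained(
+            ...     "runwayml/stable-diffusion-inpainting", revision="onnx", provider="CPUExecutionProvider"
+            ... )
+            >>> image = pipe(prompt="a red fox sitting on a bench", image=image, mask_image=mask_image).images[0]
+            ```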
+ """
+ if isinstance(prompt, str):
+ batch_size = 1
+ elif isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ if height % 8 != 0 or width % 8 != 0:
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+ if (callback_steps is None) or (
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
+ raise ValueError(
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+ f" {type(callback_steps)}."
+ )
+
+ if generator is None:
+ generator = np.random
+
+ # set timesteps
+ self.scheduler.set_timesteps(num_inference_steps)
+
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ text_embeddings = self._encode_prompt(
+ prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+ )
+
+ num_channels_latents = NUM_LATENT_CHANNELS
+ latents_shape = (batch_size * num_images_per_prompt, num_channels_latents, height // 8, width // 8)
+ latents_dtype = text_embeddings.dtype
+ if latents is None:
+ latents = generator.randn(*latents_shape).astype(latents_dtype)
+ else:
+ if latents.shape != latents_shape:
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+
+ # prepare mask and masked_image
+ mask, masked_image = prepare_mask_and_masked_image(image, mask_image, latents_shape[-2:])
+ mask = mask.astype(latents.dtype)
+ masked_image = masked_image.astype(latents.dtype)
+
+ masked_image_latents = self.vae_encoder(sample=masked_image)[0]
+ masked_image_latents = 0.18215 * masked_image_latents
+
+ # duplicate mask and masked_image_latents for each generation per prompt
+ mask = mask.repeat(batch_size * num_images_per_prompt, 0)
+ masked_image_latents = masked_image_latents.repeat(batch_size * num_images_per_prompt, 0)
+
+ mask = np.concatenate([mask] * 2) if do_classifier_free_guidance else mask
+ masked_image_latents = (
+ np.concatenate([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+ )
+
+ num_channels_mask = mask.shape[1]
+ num_channels_masked_image = masked_image_latents.shape[1]
+
+ unet_input_channels = NUM_UNET_INPUT_CHANNELS
+ if num_channels_latents + num_channels_mask + num_channels_masked_image != unet_input_channels:
+ raise ValueError(
+ "Incorrect configuration settings! The config of `pipeline.unet` expects"
+ f" {unet_input_channels} but received `num_channels_latents`: {num_channels_latents} +"
+ f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+ f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+ " `pipeline.unet` or your `mask_image` or `image` input."
+ )
+
+ # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * float(self.scheduler.init_noise_sigma)
+
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+
+ timestep_dtype = next(
+ (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)"
+ )
+ timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype]
+
+ for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
+            # concat latents, mask, masked_image_latents in the channel dimension
+ latent_model_input = np.concatenate([latent_model_input, mask, masked_image_latents], axis=1)
+ latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t)
+ latent_model_input = latent_model_input.cpu().numpy()
+
+ # predict the noise residual
+ timestep = np.array([t], dtype=timestep_dtype)
+ noise_pred = self.unet(
+ sample=latent_model_input, timestep=timestep, encoder_hidden_states=text_embeddings
+ )[0]
+
+ # perform guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, torch.from_numpy(latents), **extra_step_kwargs).prev_sample
+ latents = latents.numpy()
+
+ # call the callback, if provided
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, latents)
+
+ latents = 1 / 0.18215 * latents
+ # image = self.vae_decoder(latent_sample=latents)[0]
+        # decode sample-by-sample: the half-precision VAE decoder gives strange results when batch size > 1
+ image = np.concatenate(
+ [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
+ )
+
+ image = np.clip(image / 2 + 0.5, 0, 1)
+ image = image.transpose((0, 2, 3, 1))
+
+ if self.safety_checker is not None:
+ safety_checker_input = self.feature_extractor(
+ self.numpy_to_pil(image), return_tensors="np"
+ ).pixel_values.astype(image.dtype)
+ # safety_checker does not support batched inputs yet
+ images, has_nsfw_concept = [], []
+ for i in range(image.shape[0]):
+ image_i, has_nsfw_concept_i = self.safety_checker(
+ clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
+ )
+ images.append(image_i)
+ has_nsfw_concept.append(has_nsfw_concept_i[0])
+ image = np.concatenate(images)
+ else:
+ has_nsfw_concept = None
+
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ if not return_dict:
+ return (image, has_nsfw_concept)
+
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
new file mode 100644
index 0000000..5b608da
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -0,0 +1,522 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import torch
+
+from ...utils import is_accelerate_available
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import (
+ DDIMScheduler,
+ DPMSolverMultistepScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+)
+from ...utils import deprecate, logging
+from . import StableDiffusionPipelineOutput
+from .safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+class StableDiffusionPipeline(DiffusionPipeline):
+ r"""
+ Pipeline for text-to-image generation using Stable Diffusion.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`CLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ safety_checker ([`StableDiffusionSafetyChecker`]):
+ Classification module that estimates whether generated images could be considered offensive or harmful.
+ Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+ feature_extractor ([`CLIPFeatureExtractor`]):
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+ """
+
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: Union[
+ DDIMScheduler,
+ PNDMScheduler,
+ LMSDiscreteScheduler,
+ EulerDiscreteScheduler,
+ EulerAncestralDiscreteScheduler,
+ DPMSolverMultistepScheduler,
+ ],
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ ):
+ super().__init__()
+
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might lead to incorrect results"
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0",
+ deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["steps_offset"] = 1
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+ deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} sets `clip_sample` to True."
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+ )
+ deprecate("clip_sample not set", "1.0.0",
+ deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["clip_sample"] = False
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend keeping the safety filter enabled in all public-facing circumstances, disabling"
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+ )
+
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ )
+
+ def enable_xformers_memory_efficient_attention(self):
+ r"""
+ Enable memory efficient attention as implemented in xformers.
+
+ When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+ time. Speed up at training time is not guaranteed.
+
+ Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+ is used.
+ """
+ self.unet.set_use_memory_efficient_attention_xformers(True)
+
+ def disable_xformers_memory_efficient_attention(self):
+ r"""
+ Disable memory efficient attention as implemented in xformers.
+ """
+ self.unet.set_use_memory_efficient_attention_xformers(False)
+
+ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+ r"""
+ Enable sliced attention computation.
+
+ When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+ in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+ Args:
+ slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+ When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+ a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+ `attention_head_dim` must be a multiple of `slice_size`.
+ """
+ if slice_size == "auto":
+ # half the attention head size is usually a good trade-off between
+ # speed and memory
+ slice_size = self.unet.config.attention_head_dim // 2
+ self.unet.set_attention_slice(slice_size)
+
+ def disable_attention_slicing(self):
+ r"""
+ Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+ back to computing attention in one step.
+ """
+ # set slice_size = `None` to disable `attention slicing`
+ self.enable_attention_slicing(None)
+
+ def enable_sequential_cpu_offload(self):
+ r"""
+ Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+ text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
+ """
+ if is_accelerate_available():
+ from accelerate import cpu_offload
+ else:
+ raise ImportError(
+ "Please install accelerate via `pip install accelerate`")
+
+ device = torch.device("cuda")
+
+ for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+ if cpu_offloaded_model is not None:
+ cpu_offload(cpu_offloaded_model, device)
+
+ @property
+ def _execution_device(self):
+ r"""
+ Returns the device on which the pipeline's models will be executed. After calling
+ `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+ hooks.
+ """
+ if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+ return self.device
+ for module in self.unet.modules():
+ if (
+ hasattr(module, "_hf_hook")
+ and hasattr(module._hf_hook, "execution_device")
+ and module._hf_hook.execution_device is not None
+ ):
+ return torch.device(module._hf_hook.execution_device)
+ return self.device
+
+ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+            prompt (`str` or `List[str]`):
+ prompt to be encoded
+ device: (`torch.device`):
+ torch device
+ num_images_per_prompt (`int`):
+ number of images that should be generated per prompt
+ do_classifier_free_guidance (`bool`):
+ whether to use classifier free guidance or not
+ negative_prompt (`str` or `List[str]`):
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+ if `guidance_scale` is less than `1`).
+ """
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ return_tensors="pt",
+ )
+ text_input_ids = text_inputs.input_ids
+
+ if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+ removed_text = self.tokenizer.batch_decode(
+ text_input_ids[:, self.tokenizer.model_max_length:])
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+ text_input_ids = text_input_ids[:,
+ : self.tokenizer.model_max_length]
+ text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
+
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
+ bs_embed, seq_len, _ = text_embeddings.shape
+ text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+ text_embeddings = text_embeddings.view(
+ bs_embed * num_images_per_prompt, seq_len, -1)
+
+ # get unconditional embeddings for classifier free guidance
+ if do_classifier_free_guidance:
+ uncond_tokens: List[str]
+ if negative_prompt is None:
+ uncond_tokens = [""] * batch_size
+ elif type(prompt) is not type(negative_prompt):
+ raise TypeError(
+                    f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt] * batch_size
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = negative_prompt
+
+ max_length = text_input_ids.shape[-1]
+ uncond_input = self.tokenizer(
+ uncond_tokens,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+ uncond_embeddings = self.text_encoder(
+ uncond_input.input_ids.to(device))[0]
+
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+ seq_len = uncond_embeddings.shape[1]
+ uncond_embeddings = uncond_embeddings.repeat(
+ 1, num_images_per_prompt, 1)
+ uncond_embeddings = uncond_embeddings.view(
+ batch_size * num_images_per_prompt, seq_len, -1)
+
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+ return text_embeddings
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ prompt: Union[str, List[str]],
+ height: int = 512,
+ width: int = 512,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[torch.Generator] = None,
+ latents: Optional[torch.FloatTensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[
+ int, int, torch.FloatTensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ **kwargs,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`):
+ The prompt or prompts to guide the image generation.
+ height (`int`, *optional*, defaults to 512):
+ The height in pixels of the generated image.
+ width (`int`, *optional*, defaults to 512):
+ The width in pixels of the generated image.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages the model to generate images closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+ if `guidance_scale` is less than `1`).
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`torch.Generator`, *optional*):
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+ deterministic.
+ latents (`torch.FloatTensor`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+ plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+
+ Returns:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+ (nsfw) content, according to the `safety_checker`.
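+
+        Examples:
+            A minimal usage sketch; the checkpoint id is an illustrative assumption:
+
+            ```py
+            >>> import torch
+            >>> pipe = StableDiffusionPipeline.from_pretrained(
+            ...     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
+            ... ).to("cuda")
+            >>> image = pipe("a photo of an astronaut riding a horse").images[0]
+            ```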
+ """
+ if isinstance(prompt, str):
+ batch_size = 1
+ elif isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ raise ValueError(
+ f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ if height % 8 != 0 or width % 8 != 0:
+ raise ValueError(
+ f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+ if (callback_steps is None) or (
+ callback_steps is not None and (not isinstance(
+ callback_steps, int) or callback_steps <= 0)
+ ):
+ raise ValueError(
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+ f" {type(callback_steps)}."
+ )
+
+ device = self._execution_device
+
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ text_embeddings = self._encode_prompt(
+ prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+ )
+
+ # Unlike in other pipelines, latents need to be generated in the target device
+ # for 1-to-1 results reproducibility with the CompVis implementation.
+ # However this currently doesn't work in `mps`.
+
+ # get the initial random noise unless the user supplied it
+ latents_shape = (batch_size * num_images_per_prompt,
+ self.unet.in_channels, height // 8, width // 8)
+ latents_dtype = text_embeddings.dtype
+ if latents is None:
+ if device.type == "mps":
+ # randn does not work reproducibly on mps
+ latents = torch.randn(
+ latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(device)
+ else:
+ latents = torch.randn(
+ latents_shape, generator=generator, device=device, dtype=latents_dtype)
+ else:
+ if latents.shape != latents_shape:
+ raise ValueError(
+ f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+ latents = latents.to(device)
+
+ # set timesteps and move to the correct device
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
+ timesteps_tensor = self.scheduler.timesteps
+
+        repeat_timesteps = kwargs.get("repeat_timesteps", None)
+        if repeat_timesteps is not None:
+            repeat_times = kwargs.get("repeat_times", 1)
+            repeat_timesteps_tensor = torch.tensor(repeat_timesteps, device=device,
+                                                   dtype=timesteps_tensor.dtype).repeat(repeat_times)
+            timesteps_tensor = torch.cat(
+                [timesteps_tensor, repeat_timesteps_tensor])
+            logger.debug("timesteps after appending repeats: %s", timesteps_tensor)
+
+ # scale the initial noise by the standard deviation required by the scheduler
+ latents = latents * self.scheduler.init_noise_sigma
+
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+ accepts_eta = "eta" in set(inspect.signature(
+ self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+
+ # check if the scheduler accepts generator
+ accepts_generator = "generator" in set(
+ inspect.signature(self.scheduler.step).parameters.keys())
+ if accepts_generator:
+ extra_step_kwargs["generator"] = generator
+
+ for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat(
+ [latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(
+ latent_model_input, t)
+
+ # predict the noise residual
+ noise_pred = self.unet(latent_model_input, t,
+ encoder_hidden_states=text_embeddings).sample
+
+ # perform guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * \
+ (noise_pred_text - noise_pred_uncond)
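+                # equivalently noise_pred = (1 - w) * noise_pred_uncond + w * noise_pred_text
+                # with w = guidance_scale; for w > 1 this extrapolates beyond the
+                # text-conditioned prediction, strengthening prompt adherence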
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(
+ noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+ # call the callback, if provided
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, latents)
+
+ latents = 1 / 0.18215 * latents
+ image = self.vae.decode(latents).sample
+
+ image = (image / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if self.safety_checker is not None:
+ safety_checker_input = self.feature_extractor(
+ self.numpy_to_pil(image), return_tensors="pt").to(device)
+ image, has_nsfw_concept = self.safety_checker(
+ images=image, clip_input=safety_checker_input.pixel_values.to(
+ text_embeddings.dtype)
+ )
+ else:
+ has_nsfw_concept = None
+
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ if not return_dict:
+ return (image, has_nsfw_concept)
+
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
new file mode 100644
index 0000000..a3ef8d1
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -0,0 +1,544 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+import PIL
+from ...utils import is_accelerate_available
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import (
+ DDIMScheduler,
+ EulerAncestralDiscreteScheduler,
+ EulerDiscreteScheduler,
+ LMSDiscreteScheduler,
+ PNDMScheduler,
+)
+from ...utils import deprecate, logging
+from . import StableDiffusionPipelineOutput
+from .safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+def preprocess(image):
+ w, h = image.size
+ w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
+ image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+ image = np.array(image).astype(np.float32) / 255.0
+ image = image[None].transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image)
+ return 2.0 * image - 1.0
+
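+# A minimal sketch of what `preprocess` returns (the file name is hypothetical):
+#
+#   init_image = PIL.Image.open("dog.png").convert("RGB")  # e.g. 513 x 768
+#   batch = preprocess(init_image)  # resized down to 512 x 768
+#   # batch.shape == torch.Size([1, 3, 768, 512]), values in [-1, 1]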
+
+class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
+ r"""
+ Pipeline for text-guided image to image generation using Stable Diffusion.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`CLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ safety_checker ([`StableDiffusionSafetyChecker`]):
+ Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+ feature_extractor ([`CLIPFeatureExtractor`]):
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+ """
+
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: Union[
+ DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler, EulerDiscreteScheduler, EulerAncestralDiscreteScheduler
+ ],
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ ):
+ super().__init__()
+
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+ "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0",
+ deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["steps_offset"] = 1
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+ )
+ deprecate("clip_sample not set", "1.0.0",
+ deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["clip_sample"] = False
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend keeping the safety filter enabled in all public-facing circumstances, disabling"
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+ )
+
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ )
+
+ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+ r"""
+ Enable sliced attention computation.
+
+ When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+ in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+ Args:
+ slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+ When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+ a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+ `attention_head_dim` must be a multiple of `slice_size`.
+ """
+ if slice_size == "auto":
+ # half the attention head size is usually a good trade-off between
+ # speed and memory
+ slice_size = self.unet.config.attention_head_dim // 2
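+            # e.g. attention_head_dim = 8 gives slice_size = 4, i.e. attention is
+            # computed in two slices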
+ self.unet.set_attention_slice(slice_size)
+
+ def disable_attention_slicing(self):
+ r"""
+ Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+ back to computing attention in one step.
+ """
+ # set slice_size = `None` to disable `set_attention_slice`
+ self.enable_attention_slicing(None)
+
+ def enable_sequential_cpu_offload(self):
+ r"""
+ Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+ text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
+ """
+ if is_accelerate_available():
+ from accelerate import cpu_offload
+ else:
+ raise ImportError(
+ "Please install accelerate via `pip install accelerate`")
+
+ device = torch.device("cuda")
+
+ for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+ if cpu_offloaded_model is not None:
+ cpu_offload(cpu_offloaded_model, device)
+
+ @property
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+ def _execution_device(self):
+ r"""
+ Returns the device on which the pipeline's models will be executed. After calling
+ `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+ hooks.
+ """
+ if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+ return self.device
+ for module in self.unet.modules():
+ if (
+ hasattr(module, "_hf_hook")
+ and hasattr(module._hf_hook, "execution_device")
+ and module._hf_hook.execution_device is not None
+ ):
+ return torch.device(module._hf_hook.execution_device)
+ return self.device
+
+ def enable_xformers_memory_efficient_attention(self):
+ r"""
+ Enable memory efficient attention as implemented in xformers.
+
+ When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+ time. Speed up at training time is not guaranteed.
+
+ Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+ is used.
+ """
+ self.unet.set_use_memory_efficient_attention_xformers(True)
+
+ def disable_xformers_memory_efficient_attention(self):
+ r"""
+ Disable memory efficient attention as implemented in xformers.
+ """
+ self.unet.set_use_memory_efficient_attention_xformers(False)
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+ prompt (`str` or `list(int)`):
+ prompt to be encoded
+ device: (`torch.device`):
+ torch device
+ num_images_per_prompt (`int`):
+ number of images that should be generated per prompt
+ do_classifier_free_guidance (`bool`):
+ whether to use classifier free guidance or not
+ negative_prompt (`str` or `List[str]`):
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+ if `guidance_scale` is less than `1`).
+ """
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ return_tensors="pt",
+ )
+ text_input_ids = text_inputs.input_ids
+
+ if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+ removed_text = self.tokenizer.batch_decode(
+ text_input_ids[:, self.tokenizer.model_max_length:])
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+ text_input_ids = text_input_ids[:,
+ : self.tokenizer.model_max_length]
+ text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
+
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
+ bs_embed, seq_len, _ = text_embeddings.shape
+ text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+ text_embeddings = text_embeddings.view(
+ bs_embed * num_images_per_prompt, seq_len, -1)
+
+ # get unconditional embeddings for classifier free guidance
+ if do_classifier_free_guidance:
+ uncond_tokens: List[str]
+ if negative_prompt is None:
+ uncond_tokens = [""] * batch_size
+ elif type(prompt) is not type(negative_prompt):
+ raise TypeError(
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif isinstance(negative_prompt, str):
+ uncond_tokens = [negative_prompt]
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = negative_prompt
+
+ max_length = text_input_ids.shape[-1]
+ uncond_input = self.tokenizer(
+ uncond_tokens,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+ uncond_embeddings = self.text_encoder(
+ uncond_input.input_ids.to(device))[0]
+
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+ seq_len = uncond_embeddings.shape[1]
+ uncond_embeddings = uncond_embeddings.repeat(
+ 1, num_images_per_prompt, 1)
+ uncond_embeddings = uncond_embeddings.view(
+ batch_size * num_images_per_prompt, seq_len, -1)
+
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
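+            # layout: (2 * batch_size * num_images_per_prompt, seq_len, dim) with the
+            # unconditional embeddings first, so a later `.chunk(2)` on the UNet output
+            # yields (uncond, text) in that order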
+
+ return text_embeddings
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ prompt: Union[str, List[str]],
+ init_image: Union[torch.FloatTensor, PIL.Image.Image],
+ strength: float = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.0,
+ generator: Optional[torch.Generator] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[
+ int, int, torch.FloatTensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ **kwargs,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`):
+ The prompt or prompts to guide the image generation.
+ init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+ `Image`, or tensor representing an image batch, that will be used as the starting point for the
+ process.
+ strength (`float`, *optional*, defaults to 0.8):
+ Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+ `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+ number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+ noise will be maximum and the denoising process will run for the full number of iterations specified in
+ `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference. This parameter will be modulated by `strength`.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2 of the [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+ if `guidance_scale` is less than `1`).
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`torch.Generator`, *optional*):
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+ deterministic.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+ plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+
+ Returns:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+ (nsfw) content, according to the `safety_checker`.
+ """
+ if isinstance(prompt, str):
+ batch_size = 1
+ elif isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ raise ValueError(
+ f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ if strength < 0 or strength > 1:
+ raise ValueError(
+ f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if callback_steps is None or not isinstance(callback_steps, int) or callback_steps <= 0:
+ raise ValueError(
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+ f" {type(callback_steps)}."
+ )
+
+ device = self._execution_device
+
+ # set timesteps
+ self.scheduler.set_timesteps(num_inference_steps)
+
+ if isinstance(init_image, PIL.Image.Image):
+ init_image = preprocess(init_image)
+
+        # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ text_embeddings = self._encode_prompt(
+ prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+ )
+
+ # encode the init image into latents and scale the latents
+ latents_dtype = text_embeddings.dtype
+ init_image = init_image.to(device=device, dtype=latents_dtype)
+ init_latent_dist = self.vae.encode(init_image).latent_dist
+ init_latents = init_latent_dist.sample(generator=generator)
+ init_latents = 0.18215 * init_latents
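+        # 0.18215 is the Stable Diffusion VAE scaling factor; decoding at the end of
+        # this function multiplies by 1 / 0.18215 to undo it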
+
+ if isinstance(prompt, str):
+ prompt = [prompt]
+ if len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] == 0:
+ # expand init_latents for batch_size
+ deprecation_message = (
+ f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+ " images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
+ " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+ " your script to pass as many init images as text prompts to suppress this warning."
+ )
+ deprecate("len(prompt) != len(init_image)", "1.0.0",
+ deprecation_message, standard_warn=False)
+ additional_image_per_prompt = len(prompt) // init_latents.shape[0]
+ init_latents = torch.cat(
+ [init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
+ elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0:
+ raise ValueError(
+ f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts."
+ )
+ else:
+ init_latents = torch.cat(
+ [init_latents] * num_images_per_prompt, dim=0)
+
+ # get the original timestep using init_timestep
+ offset = self.scheduler.config.get("steps_offset", 0)
+ init_timestep = int(num_inference_steps * strength) + offset
+ init_timestep = min(init_timestep, num_inference_steps)
+
+ timesteps = self.scheduler.timesteps[-init_timestep]
+ timesteps = torch.tensor(
+ [timesteps] * batch_size * num_images_per_prompt, device=device)
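+        # worked example: num_inference_steps=50, strength=0.8, offset=1 gives
+        # init_timestep = int(50 * 0.8) + 1 = 41; noise is added at timesteps[-41]
+        # and t_start below becomes max(50 - 41 + 1, 0) = 10, so the denoising loop
+        # runs over the remaining 40 entries of the schedule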
+
+ # add noise to latents using the timesteps
+ noise = torch.randn(init_latents.shape, generator=generator,
+ device=device, dtype=latents_dtype)
+ init_latents = self.scheduler.add_noise(init_latents, noise, timesteps)
+
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+ accepts_eta = "eta" in set(inspect.signature(
+ self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+
+ # check if the scheduler accepts generator
+ accepts_generator = "generator" in set(
+ inspect.signature(self.scheduler.step).parameters.keys())
+ if accepts_generator:
+ extra_step_kwargs["generator"] = generator
+
+ latents = init_latents
+
+ t_start = max(num_inference_steps - init_timestep + offset, 0)
+
+ # Some schedulers like PNDM have timesteps as arrays
+ # It's more optimized to move all timesteps to correct device beforehand
+ timesteps = self.scheduler.timesteps[t_start:].to(device)
+
+ for i, t in enumerate(self.progress_bar(timesteps)):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat(
+ [latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(
+ latent_model_input, t)
+
+ # predict the noise residual
+ noise_pred = self.unet(latent_model_input, t,
+ encoder_hidden_states=text_embeddings).sample
+
+ # perform guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * \
+ (noise_pred_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(
+ noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+ # call the callback, if provided
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, latents)
+
+ latents = 1 / 0.18215 * latents
+ image = self.vae.decode(latents).sample
+
+ image = (image / 2 + 0.5).clamp(0, 1)
+ image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+ if self.safety_checker is not None:
+ safety_checker_input = self.feature_extractor(
+ self.numpy_to_pil(image), return_tensors="pt").to(device)
+ image, has_nsfw_concept = self.safety_checker(
+ images=image, clip_input=safety_checker_input.pixel_values.to(
+ text_embeddings.dtype)
+ )
+ else:
+ has_nsfw_concept = None
+
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ if not return_dict:
+ return (image, has_nsfw_concept)
+
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
new file mode 100644
index 0000000..6adcf24
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -0,0 +1,579 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+import PIL
+from ...utils import is_accelerate_available
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import deprecate, logging
+from . import StableDiffusionPipelineOutput
+from .safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+def prepare_mask_and_masked_image(image, mask):
+ image = np.array(image.convert("RGB"))
+ image = image[None].transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+ mask = np.array(mask.convert("L"))
+ mask = mask.astype(np.float32) / 255.0
+ mask = mask[None, None]
+ mask[mask < 0.5] = 0
+ mask[mask >= 0.5] = 1
+ mask = torch.from_numpy(mask)
+
+ masked_image = image * (mask < 0.5)
+
+ return mask, masked_image
+
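+# A sketch of the resulting tensors for 512 x 512 PIL inputs (file names are
+# hypothetical):
+#
+#   image = PIL.Image.open("photo.png")      # RGB image to inpaint
+#   mask_image = PIL.Image.open("mask.png")  # white = repaint, black = keep
+#   mask, masked = prepare_mask_and_masked_image(image, mask_image)
+#   # mask.shape   == torch.Size([1, 1, 512, 512]), binarized to {0, 1}
+#   # masked.shape == torch.Size([1, 3, 512, 512]), zeroed where mask == 1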
+
+class StableDiffusionInpaintPipeline(DiffusionPipeline):
+ r"""
+ Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`CLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ safety_checker ([`StableDiffusionSafetyChecker`]):
+ Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+ feature_extractor ([`CLIPFeatureExtractor`]):
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+ """
+
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ ):
+ super().__init__()
+
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+ "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0",
+ deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["steps_offset"] = 1
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if hasattr(scheduler.config, "skip_prk_steps") and scheduler.config.skip_prk_steps is False:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration"
+ " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make"
+ " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to"
+ " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face"
+ " Hub, it would be very nice if you could open a Pull request for the"
+ " `scheduler/scheduler_config.json` file"
+ )
+ deprecate("skip_prk_steps not set", "1.0.0",
+ deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["skip_prk_steps"] = True
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend keeping the safety filter enabled in all public-facing circumstances, disabling"
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+ )
+
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ )
+
+ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+ r"""
+ Enable sliced attention computation.
+
+ When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+ in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+ Args:
+ slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+ When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+ a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+ `attention_head_dim` must be a multiple of `slice_size`.
+ """
+ if slice_size == "auto":
+ # half the attention head size is usually a good trade-off between
+ # speed and memory
+ slice_size = self.unet.config.attention_head_dim // 2
+ self.unet.set_attention_slice(slice_size)
+
+ def disable_attention_slicing(self):
+ r"""
+ Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+ back to computing attention in one step.
+ """
+ # set slice_size = `None` to disable `attention slicing`
+ self.enable_attention_slicing(None)
+
+ def enable_sequential_cpu_offload(self):
+ r"""
+ Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+ text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
+ """
+ if is_accelerate_available():
+ from accelerate import cpu_offload
+ else:
+ raise ImportError(
+ "Please install accelerate via `pip install accelerate`")
+
+ device = torch.device("cuda")
+
+ for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+ if cpu_offloaded_model is not None:
+ cpu_offload(cpu_offloaded_model, device)
+
+ @property
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+ def _execution_device(self):
+ r"""
+ Returns the device on which the pipeline's models will be executed. After calling
+ `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+ hooks.
+ """
+ if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+ return self.device
+ for module in self.unet.modules():
+ if (
+ hasattr(module, "_hf_hook")
+ and hasattr(module._hf_hook, "execution_device")
+ and module._hf_hook.execution_device is not None
+ ):
+ return torch.device(module._hf_hook.execution_device)
+ return self.device
+
+ def enable_xformers_memory_efficient_attention(self):
+ r"""
+ Enable memory efficient attention as implemented in xformers.
+
+ When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+ time. Speed up at training time is not guaranteed.
+
+ Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+ is used.
+ """
+ self.unet.set_use_memory_efficient_attention_xformers(True)
+
+ def disable_xformers_memory_efficient_attention(self):
+ r"""
+ Disable memory efficient attention as implemented in xformers.
+ """
+ self.unet.set_use_memory_efficient_attention_xformers(False)
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+ prompt (`str` or `list(int)`):
+ prompt to be encoded
+ device: (`torch.device`):
+ torch device
+ num_images_per_prompt (`int`):
+ number of images that should be generated per prompt
+ do_classifier_free_guidance (`bool`):
+ whether to use classifier free guidance or not
+ negative_prompt (`str` or `List[str]`):
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+ if `guidance_scale` is less than `1`).
+ """
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ return_tensors="pt",
+ )
+ text_input_ids = text_inputs.input_ids
+
+ if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+ removed_text = self.tokenizer.batch_decode(
+ text_input_ids[:, self.tokenizer.model_max_length:])
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+ text_input_ids = text_input_ids[:,
+ : self.tokenizer.model_max_length]
+ text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
+
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
+ bs_embed, seq_len, _ = text_embeddings.shape
+ text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+ text_embeddings = text_embeddings.view(
+ bs_embed * num_images_per_prompt, seq_len, -1)
+
+ # get unconditional embeddings for classifier free guidance
+ if do_classifier_free_guidance:
+ uncond_tokens: List[str]
+ if negative_prompt is None:
+ uncond_tokens = [""] * batch_size
+ elif type(prompt) is not type(negative_prompt):
+ raise TypeError(
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif isinstance(negative_prompt, str):
+ uncond_tokens = [negative_prompt]
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = negative_prompt
+
+ max_length = text_input_ids.shape[-1]
+ uncond_input = self.tokenizer(
+ uncond_tokens,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+ uncond_embeddings = self.text_encoder(
+ uncond_input.input_ids.to(device))[0]
+
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+ seq_len = uncond_embeddings.shape[1]
+ uncond_embeddings = uncond_embeddings.repeat(
+ 1, num_images_per_prompt, 1)
+ uncond_embeddings = uncond_embeddings.view(
+ batch_size * num_images_per_prompt, seq_len, -1)
+
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+ return text_embeddings
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ prompt: Union[str, List[str]],
+ image: Union[torch.FloatTensor, PIL.Image.Image],
+ mask_image: Union[torch.FloatTensor, PIL.Image.Image],
+ height: int = 512,
+ width: int = 512,
+ num_inference_steps: int = 50,
+ guidance_scale: float = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: float = 0.0,
+ generator: Optional[torch.Generator] = None,
+ latents: Optional[torch.FloatTensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[
+ int, int, torch.FloatTensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ **kwargs,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`):
+ The prompt or prompts to guide the image generation.
+ image (`PIL.Image.Image`):
+ `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+ be masked out with `mask_image` and repainted according to `prompt`.
+ mask_image (`PIL.Image.Image`):
+ `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+ repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+ to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+ instead of 3, so the expected shape would be `(B, H, W, 1)`.
+ height (`int`, *optional*, defaults to 512):
+ The height in pixels of the generated image.
+ width (`int`, *optional*, defaults to 512):
+ The width in pixels of the generated image.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2 of the [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+ if `guidance_scale` is less than `1`).
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`torch.Generator`, *optional*):
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+ deterministic.
+ latents (`torch.FloatTensor`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+ plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+
+ Returns:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+ (nsfw) content, according to the `safety_checker`.
+ """
+ self.vae.eval()
+ self.unet.eval()
+ self.text_encoder.eval()
+ if isinstance(prompt, str):
+ batch_size = 1
+ elif isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ raise ValueError(
+ f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ if height % 8 != 0 or width % 8 != 0:
+ raise ValueError(
+ f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if callback_steps is None or not isinstance(callback_steps, int) or callback_steps <= 0:
+ raise ValueError(
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+ f" {type(callback_steps)}."
+ )
+
+ device = self._execution_device
+
+        # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+ text_embeddings = self._encode_prompt(
+ prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+ )
+
+ # get the initial random noise unless the user supplied it
+ # Unlike in other pipelines, latents need to be generated in the target device
+ # for 1-to-1 results reproducibility with the CompVis implementation.
+ # However this currently doesn't work in `mps`.
+ num_channels_latents = self.vae.config.latent_channels
+ latents_shape = (batch_size * num_images_per_prompt,
+ num_channels_latents, height // 8, width // 8)
+ latents_dtype = text_embeddings.dtype
+ if latents is None:
+ if device.type == "mps":
+ # randn does not exist on mps
+ latents = torch.randn(
+ latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(device)
+ else:
+ latents = torch.randn(
+ latents_shape, generator=generator, device=device, dtype=latents_dtype)
+ else:
+ if latents.shape != latents_shape:
+ raise ValueError(
+ f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+ latents = latents.to(device)
+
+ # prepare mask and masked_image
+ mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
+
+ # resize the mask to latents shape as we concatenate the mask to the latents
+ # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+ # and half precision
+ mask = torch.nn.functional.interpolate(
+ mask, size=(height // 8, width // 8))
+ mask = mask.to(device=device, dtype=text_embeddings.dtype)
+
+ masked_image = masked_image.to(
+ device=device, dtype=text_embeddings.dtype)
+
+ # encode the mask image into latents space so we can concatenate it to the latents
+ masked_image_latents = self.vae.encode(
+ masked_image).latent_dist.sample(generator=generator)
+ masked_image_latents = 0.18215 * masked_image_latents
+
+ # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+ mask = mask.repeat(batch_size * num_images_per_prompt, 1, 1, 1)
+ masked_image_latents = masked_image_latents.repeat(
+ batch_size * num_images_per_prompt, 1, 1, 1)
+
+ mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+ masked_image_latents = (
+ torch.cat([masked_image_latents] *
+ 2) if do_classifier_free_guidance else masked_image_latents
+ )
+
+ # aligning device to prevent device errors when concating it with the latent model input
+ masked_image_latents = masked_image_latents.to(
+ device=device, dtype=text_embeddings.dtype)
+
+ num_channels_mask = mask.shape[1]
+ num_channels_masked_image = masked_image_latents.shape[1]
+
+ if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
+ raise ValueError(
+ f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+ f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+ f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+ f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+ " `pipeline.unet` or your `mask_image` or `image` input."
+ )
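+        # for the standard Stable Diffusion inpainting UNet this check expects
+        # 4 (latents) + 1 (mask) + 4 (masked-image latents) = 9 input channels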
+
+ # set timesteps and move to the correct device
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
+ timesteps_tensor = self.scheduler.timesteps
+
+ # scale the initial noise by the standard deviation required by the scheduler
+ latents = latents * self.scheduler.init_noise_sigma
+
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+ accepts_eta = "eta" in set(inspect.signature(
+ self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+
+ # check if the scheduler accepts generator
+ accepts_generator = "generator" in set(
+ inspect.signature(self.scheduler.step).parameters.keys())
+ if accepts_generator:
+ extra_step_kwargs["generator"] = generator
+
+ for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat(
+ [latents] * 2) if do_classifier_free_guidance else latents
+
+ # concat latents, mask, masked_image_latents in the channel dimension
+ latent_model_input = torch.cat(
+ [latent_model_input, mask, masked_image_latents], dim=1)
+
+ latent_model_input = self.scheduler.scale_model_input(
+ latent_model_input, t)
+
+ # predict the noise residual
+ noise_pred = self.unet(latent_model_input, t,
+ encoder_hidden_states=text_embeddings).sample
+
+ # perform guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * \
+ (noise_pred_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(
+ noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+ # call the callback, if provided
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, latents)
+
+ latents = 1 / 0.18215 * latents
+ image = self.vae.decode(latents).sample
+
+ image = (image / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+ if self.safety_checker is not None:
+ safety_checker_input = self.feature_extractor(
+ self.numpy_to_pil(image), return_tensors="pt").to(device)
+ image, has_nsfw_concept = self.safety_checker(
+ images=image, clip_input=safety_checker_input.pixel_values.to(
+ text_embeddings.dtype)
+ )
+ else:
+ has_nsfw_concept = None
+
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ if not return_dict:
+ return (image, has_nsfw_concept)
+
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
new file mode 100644
index 0000000..89d40f7
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -0,0 +1,492 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+import PIL
+from tqdm.auto import tqdm
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import deprecate, logging
+from . import StableDiffusionPipelineOutput
+from .safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)
+
+
+def preprocess_image(image):
+ w, h = image.size
+ w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
+ image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+ image = np.array(image).astype(np.float32) / 255.0
+ image = image[None].transpose(0, 3, 1, 2)
+ image = torch.from_numpy(image)
+ return 2.0 * image - 1.0
+
+
+def preprocess_mask(mask):
+ mask = mask.convert("L")
+ w, h = mask.size
+ w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
+ mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
+ mask = np.array(mask).astype(np.float32) / 255.0
+ mask = np.tile(mask, (4, 1, 1))
+    mask = mask[None]  # add a batch dimension; the former transpose(0, 1, 2, 3) was an identity no-op
+ mask = 1 - mask # repaint white, keep black
+ mask = torch.from_numpy(mask)
+ return mask
+
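+# A sketch of the expected output for a 512 x 512 PIL mask (file name is
+# hypothetical):
+#
+#   mask = preprocess_mask(PIL.Image.open("mask.png"))
+#   # mask.shape == torch.Size([1, 4, 64, 64]); after the inversion above,
+#   # 1 marks latent pixels to keep and 0 marks pixels to repaint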
+
+class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
+ r"""
+ Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*.
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Args:
+ vae ([`AutoencoderKL`]):
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+ text_encoder ([`CLIPTextModel`]):
+ Frozen text-encoder. Stable Diffusion uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+ scheduler ([`SchedulerMixin`]):
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+ safety_checker ([`StableDiffusionSafetyChecker`]):
+ Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+ feature_extractor ([`CLIPFeatureExtractor`]):
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+ """
+
+ def __init__(
+ self,
+ vae: AutoencoderKL,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ unet: UNet2DConditionModel,
+ scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+ safety_checker: StableDiffusionSafetyChecker,
+ feature_extractor: CLIPFeatureExtractor,
+ ):
+ super().__init__()
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+ "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+ " file"
+ )
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["steps_offset"] = 1
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+ deprecation_message = (
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+ )
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+ new_config = dict(scheduler.config)
+ new_config["clip_sample"] = False
+ scheduler._internal_dict = FrozenDict(new_config)
+
+ if safety_checker is None:
+ logger.warning(
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+ " that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered"
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+ " strongly recommend keeping the safety filter enabled in all public-facing circumstances, disabling"
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+ )
+
+ self.register_modules(
+ vae=vae,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ unet=unet,
+ scheduler=scheduler,
+ safety_checker=safety_checker,
+ feature_extractor=feature_extractor,
+ )
+
+ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+ r"""
+ Enable sliced attention computation.
+
+ When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+ in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+ Args:
+ slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+ When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+ a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+ `attention_head_dim` must be a multiple of `slice_size`.
+ """
+ if slice_size == "auto":
+ # half the attention head size is usually a good trade-off between
+ # speed and memory
+ slice_size = self.unet.config.attention_head_dim // 2
+ self.unet.set_attention_slice(slice_size)
+
+ def disable_attention_slicing(self):
+ r"""
+ Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+ back to computing attention in one step.
+ """
+ # set slice_size = `None` to disable `set_attention_slice`
+ self.enable_attention_slicing(None)
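+
+ # Example (sketch): trading speed for memory; `pipe` is a hypothetical
+ # pipeline instance.
+ # pipe.enable_attention_slicing() # "auto": attention computed in two steps
+ # pipe.enable_attention_slicing(1) # one head-slice at a time, lowest memory
+ # pipe.disable_attention_slicing() # back to a single attention pass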
+
+ @property
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+ def _execution_device(self):
+ r"""
+ Returns the device on which the pipeline's models will be executed. After calling
+ `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+ hooks.
+ """
+ if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+ return self.device
+ for module in self.unet.modules():
+ if (
+ hasattr(module, "_hf_hook")
+ and hasattr(module._hf_hook, "execution_device")
+ and module._hf_hook.execution_device is not None
+ ):
+ return torch.device(module._hf_hook.execution_device)
+ return self.device
+
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+ r"""
+ Encodes the prompt into text encoder hidden states.
+
+ Args:
+ prompt (`str` or `List[str]`):
+ prompt to be encoded
+ device (`torch.device`):
+ torch device
+ num_images_per_prompt (`int`):
+ number of images that should be generated per prompt
+ do_classifier_free_guidance (`bool`):
+ whether to use classifier free guidance or not
+ negative_prompt (`str` or `List[str]`):
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+ if `guidance_scale` is less than `1`).
+ """
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ return_tensors="pt",
+ )
+ text_input_ids = text_inputs.input_ids
+
+ if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+ removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+ text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+ text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
+
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
+ bs_embed, seq_len, _ = text_embeddings.shape
+ text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+ text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+ # get unconditional embeddings for classifier free guidance
+ if do_classifier_free_guidance:
+ uncond_tokens: List[str]
+ if negative_prompt is None:
+ uncond_tokens = [""] * batch_size
+ elif type(prompt) is not type(negative_prompt):
+ raise TypeError(
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+ f" {type(prompt)}."
+ )
+ elif isinstance(negative_prompt, str):
+ uncond_tokens = [negative_prompt]
+ elif batch_size != len(negative_prompt):
+ raise ValueError(
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+ " the batch size of `prompt`."
+ )
+ else:
+ uncond_tokens = negative_prompt
+
+ max_length = text_input_ids.shape[-1]
+ uncond_input = self.tokenizer(
+ uncond_tokens,
+ padding="max_length",
+ max_length=max_length,
+ truncation=True,
+ return_tensors="pt",
+ )
+ uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
+
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+ seq_len = uncond_embeddings.shape[1]
+ uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+ uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+ return text_embeddings
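+
+ # Shape note: with classifier-free guidance the returned tensor stacks
+ # [uncond, cond] along the batch axis, giving
+ # (2 * batch_size * num_images_per_prompt, seq_len, hidden_dim); __call__
+ # therefore duplicates the latents before each UNet forward pass.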
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ prompt: Union[str, List[str]],
+ init_image: Union[torch.FloatTensor, PIL.Image.Image],
+ mask_image: Union[torch.FloatTensor, PIL.Image.Image],
+ strength: float = 0.8,
+ num_inference_steps: Optional[int] = 50,
+ guidance_scale: Optional[float] = 7.5,
+ negative_prompt: Optional[Union[str, List[str]]] = None,
+ num_images_per_prompt: Optional[int] = 1,
+ eta: Optional[float] = 0.0,
+ generator: Optional[torch.Generator] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ **kwargs,
+ ):
+ r"""
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`):
+ The prompt or prompts to guide the image generation.
+ init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+ `Image`, or tensor representing an image batch, that will be used as the starting point for the
+ process. This is the image whose masked region will be inpainted.
+ mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
+ `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+ replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
+ PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
+ contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
+ strength (`float`, *optional*, defaults to 0.8):
+ Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
+ is 1, the denoising process will be run on the masked area for the full number of iterations specified
+ in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
+ noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
+ the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+ usually at the expense of lower image quality.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+ if `guidance_scale` is less than `1`).
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+ [`schedulers.DDIMScheduler`], will be ignored for others.
+ generator (`torch.Generator`, *optional*):
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+ deterministic.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+ plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+
+ Returns:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+ (nsfw) content, according to the `safety_checker`.
+ """
+ if isinstance(prompt, str):
+ batch_size = 1
+ elif isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ if strength < 0 or strength > 1:
+ raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+ if (callback_steps is None) or (
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+ ):
+ raise ValueError(
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+ f" {type(callback_steps)}."
+ )
+
+ device = self._execution_device
+
+ # set timesteps
+ self.scheduler.set_timesteps(num_inference_steps)
+
+ # preprocess image
+ if not isinstance(init_image, torch.FloatTensor):
+ init_image = preprocess_image(init_image)
+
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+ # corresponds to doing no classifier free guidance.
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ text_embeddings = self._encode_prompt(
+ prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+ )
+
+ # encode the init image into latents and scale the latents
+ latents_dtype = text_embeddings.dtype
+ init_image = init_image.to(device=self.device, dtype=latents_dtype)
+ init_latent_dist = self.vae.encode(init_image).latent_dist
+ init_latents = init_latent_dist.sample(generator=generator)
+ init_latents = 0.18215 * init_latents
+
+ # Expand init_latents for batch_size and num_images_per_prompt
+ init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)
+ init_latents_orig = init_latents
+
+ # preprocess mask
+ if not isinstance(mask_image, torch.FloatTensor):
+ mask_image = preprocess_mask(mask_image)
+ mask_image = mask_image.to(device=self.device, dtype=latents_dtype)
+ mask = torch.cat([mask_image] * batch_size * num_images_per_prompt)
+
+ # check sizes
+ if not mask.shape == init_latents.shape:
+ raise ValueError("The mask and init_image should be the same size!")
+
+ # get the original timestep using init_timestep
+ offset = self.scheduler.config.get("steps_offset", 0)
+ init_timestep = int(num_inference_steps * strength) + offset
+ init_timestep = min(init_timestep, num_inference_steps)
+
+ timesteps = self.scheduler.timesteps[-init_timestep]
+ timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt, device=self.device)
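+ # Worked example: num_inference_steps=50, strength=0.8 and steps_offset=1 give
+ # init_timestep = 41, so noise is added at (and denoising starts from) the
+ # 41st timestep counted from the end of the schedule.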
+
+ # add noise to latents using the timesteps
+ noise = torch.randn(init_latents.shape, generator=generator, device=self.device, dtype=latents_dtype)
+ init_latents = self.scheduler.add_noise(init_latents, noise, timesteps)
+
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+ # and should be between [0, 1]
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+
+ # check if the scheduler accepts generator
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+ if accepts_generator:
+ extra_step_kwargs["generator"] = generator
+
+ latents = init_latents
+
+ t_start = max(num_inference_steps - init_timestep + offset, 0)
+
+ # Some schedulers like PNDM have timesteps as arrays
+ # It's more optimized to move all timesteps to correct device beforehand
+ timesteps = self.scheduler.timesteps[t_start:].to(self.device)
+
+ for i, t in tqdm(enumerate(timesteps), total=len(timesteps)):
+ # expand the latents if we are doing classifier free guidance
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+ # predict the noise residual
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+ # perform guidance
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+ # masking
+ init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
+
+ latents = (init_latents_proper * mask) + (latents * (1 - mask))
+
+ # call the callback, if provided
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, latents)
+
+ latents = 1 / 0.18215 * latents
+ image = self.vae.decode(latents).sample
+
+ image = (image / 2 + 0.5).clamp(0, 1)
+ image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+ if self.safety_checker is not None:
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
+ self.device
+ )
+ image, has_nsfw_concept = self.safety_checker(
+ images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+ )
+ else:
+ has_nsfw_concept = None
+
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ if not return_dict:
+ return (image, has_nsfw_concept)
+
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
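+
+
+# Usage sketch; the checkpoint id is illustrative and any repo with the
+# standard stable-diffusion folder layout should work:
+#
+# pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained(
+# "runwayml/stable-diffusion-v1-5"
+# ).to("cuda")
+# result = pipe(prompt="a red stop sign", init_image=init_image,
+# mask_image=mask_image, strength=0.75)
+# result.images[0].save("inpainted.png")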
diff --git a/src/model/TextGen/diffusers/pipelines/stable_diffusion/safety_checker.py b/src/model/TextGen/diffusers/pipelines/stable_diffusion/safety_checker.py
new file mode 100644
index 0000000..0477c98
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/stable_diffusion/safety_checker.py
@@ -0,0 +1,123 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from transformers import CLIPConfig, CLIPVisionModel, PreTrainedModel
+
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def cosine_distance(image_embeds, text_embeds):
+ normalized_image_embeds = nn.functional.normalize(image_embeds)
+ normalized_text_embeds = nn.functional.normalize(text_embeds)
+ return torch.mm(normalized_image_embeds, normalized_text_embeds.t())
+
+
+class StableDiffusionSafetyChecker(PreTrainedModel):
+ config_class = CLIPConfig
+
+ _no_split_modules = ["CLIPEncoderLayer"]
+
+ def __init__(self, config: CLIPConfig):
+ super().__init__(config)
+
+ self.vision_model = CLIPVisionModel(config.vision_config)
+ self.visual_projection = nn.Linear(config.vision_config.hidden_size, config.projection_dim, bias=False)
+
+ self.concept_embeds = nn.Parameter(torch.ones(17, config.projection_dim), requires_grad=False)
+ self.special_care_embeds = nn.Parameter(torch.ones(3, config.projection_dim), requires_grad=False)
+
+ self.concept_embeds_weights = nn.Parameter(torch.ones(17), requires_grad=False)
+ self.special_care_embeds_weights = nn.Parameter(torch.ones(3), requires_grad=False)
+
+ @torch.no_grad()
+ def forward(self, clip_input, images):
+ pooled_output = self.vision_model(clip_input)[1] # pooled_output
+ image_embeds = self.visual_projection(pooled_output)
+
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+ special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds).cpu().float().numpy()
+ cos_dist = cosine_distance(image_embeds, self.concept_embeds).cpu().float().numpy()
+
+ result = []
+ batch_size = image_embeds.shape[0]
+ for i in range(batch_size):
+ result_img = {"special_scores": {}, "special_care": [], "concept_scores": {}, "bad_concepts": []}
+
+ # increase this value to create a stronger `nsfw` filter
+ # at the cost of increasing the possibility of filtering benign images
+ adjustment = 0.0
+
+ for concept_idx in range(len(special_cos_dist[0])):
+ concept_cos = special_cos_dist[i][concept_idx]
+ concept_threshold = self.special_care_embeds_weights[concept_idx].item()
+ result_img["special_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3)
+ if result_img["special_scores"][concept_idx] > 0:
+ result_img["special_care"].append({concept_idx, result_img["special_scores"][concept_idx]})
+ adjustment = 0.01
+
+ for concept_idx in range(len(cos_dist[0])):
+ concept_cos = cos_dist[i][concept_idx]
+ concept_threshold = self.concept_embeds_weights[concept_idx].item()
+ result_img["concept_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3)
+ if result_img["concept_scores"][concept_idx] > 0:
+ result_img["bad_concepts"].append(concept_idx)
+
+ result.append(result_img)
+
+ has_nsfw_concepts = [len(res["bad_concepts"]) > 0 for res in result]
+
+ for idx, has_nsfw_concept in enumerate(has_nsfw_concepts):
+ if has_nsfw_concept:
+ images[idx] = np.zeros(images[idx].shape) # black image
+
+ if any(has_nsfw_concepts):
+ logger.warning(
+ "Potential NSFW content was detected in one or more images. A black image will be returned instead."
+ " Try again with a different prompt and/or seed."
+ )
+
+ return images, has_nsfw_concepts
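+
+ # Scoring sketch: an image is flagged when any cosine similarity exceeds its
+ # learned per-concept threshold, i.e. cos(image, concept) - threshold + adjustment > 0;
+ # hitting a "special care" concept first raises `adjustment` to 0.01, tightening
+ # every subsequent concept check.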
+
+ @torch.no_grad()
+ def forward_onnx(self, clip_input: torch.FloatTensor, images: torch.FloatTensor):
+ pooled_output = self.vision_model(clip_input)[1] # pooled_output
+ image_embeds = self.visual_projection(pooled_output)
+
+ special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds)
+ cos_dist = cosine_distance(image_embeds, self.concept_embeds)
+
+ # increase this value to create a stronger `nsfw` filter
+ # at the cost of increasing the possibility of filtering benign images
+ adjustment = 0.0
+
+ special_scores = special_cos_dist - self.special_care_embeds_weights + adjustment
+ # special_scores = special_scores.round(decimals=3)
+ special_care = torch.any(special_scores > 0, dim=1)
+ special_adjustment = special_care * 0.01
+ special_adjustment = special_adjustment.unsqueeze(1).expand(-1, cos_dist.shape[1])
+
+ concept_scores = (cos_dist - self.concept_embeds_weights) + special_adjustment
+ # concept_scores = concept_scores.round(decimals=3)
+ has_nsfw_concepts = torch.any(concept_scores > 0, dim=1)
+
+ images[has_nsfw_concepts] = 0.0 # black image
+
+ return images, has_nsfw_concepts
diff --git a/src/model/TextGen/diffusers/pipelines/stable_diffusion/safety_checker_flax.py b/src/model/TextGen/diffusers/pipelines/stable_diffusion/safety_checker_flax.py
new file mode 100644
index 0000000..e1f669d
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/stable_diffusion/safety_checker_flax.py
@@ -0,0 +1,112 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple
+
+import jax
+import jax.numpy as jnp
+from flax import linen as nn
+from flax.core.frozen_dict import FrozenDict
+from transformers import CLIPConfig, FlaxPreTrainedModel
+from transformers.models.clip.modeling_flax_clip import FlaxCLIPVisionModule
+
+
+def jax_cosine_distance(emb_1, emb_2, eps=1e-12):
+ norm_emb_1 = jnp.divide(emb_1.T, jnp.clip(jnp.linalg.norm(emb_1, axis=1), a_min=eps)).T
+ norm_emb_2 = jnp.divide(emb_2.T, jnp.clip(jnp.linalg.norm(emb_2, axis=1), a_min=eps)).T
+ return jnp.matmul(norm_emb_1, norm_emb_2.T)
+
+
+class FlaxStableDiffusionSafetyCheckerModule(nn.Module):
+ config: CLIPConfig
+ dtype: jnp.dtype = jnp.float32
+
+ def setup(self):
+ self.vision_model = FlaxCLIPVisionModule(self.config.vision_config)
+ self.visual_projection = nn.Dense(self.config.projection_dim, use_bias=False, dtype=self.dtype)
+
+ self.concept_embeds = self.param("concept_embeds", jax.nn.initializers.ones, (17, self.config.projection_dim))
+ self.special_care_embeds = self.param(
+ "special_care_embeds", jax.nn.initializers.ones, (3, self.config.projection_dim)
+ )
+
+ self.concept_embeds_weights = self.param("concept_embeds_weights", jax.nn.initializers.ones, (17,))
+ self.special_care_embeds_weights = self.param("special_care_embeds_weights", jax.nn.initializers.ones, (3,))
+
+ def __call__(self, clip_input):
+ pooled_output = self.vision_model(clip_input)[1]
+ image_embeds = self.visual_projection(pooled_output)
+
+ special_cos_dist = jax_cosine_distance(image_embeds, self.special_care_embeds)
+ cos_dist = jax_cosine_distance(image_embeds, self.concept_embeds)
+
+ # increase this value to create a stronger `nsfw` filter
+ # at the cost of increasing the possibility of filtering benign image inputs
+ adjustment = 0.0
+
+ special_scores = special_cos_dist - self.special_care_embeds_weights[None, :] + adjustment
+ special_scores = jnp.round(special_scores, 3)
+ is_special_care = jnp.any(special_scores > 0, axis=1, keepdims=True)
+ # Use a lower threshold if an image has any special care concept
+ special_adjustment = is_special_care * 0.01
+
+ concept_scores = cos_dist - self.concept_embeds_weights[None, :] + special_adjustment
+ concept_scores = jnp.round(concept_scores, 3)
+ has_nsfw_concepts = jnp.any(concept_scores > 0, axis=1)
+
+ return has_nsfw_concepts
+
+
+class FlaxStableDiffusionSafetyChecker(FlaxPreTrainedModel):
+ config_class = CLIPConfig
+ main_input_name = "clip_input"
+ module_class = FlaxStableDiffusionSafetyCheckerModule
+
+ def __init__(
+ self,
+ config: CLIPConfig,
+ input_shape: Optional[Tuple] = None,
+ seed: int = 0,
+ dtype: jnp.dtype = jnp.float32,
+ _do_init: bool = True,
+ **kwargs,
+ ):
+ if input_shape is None:
+ input_shape = (1, 224, 224, 3)
+ module = self.module_class(config=config, dtype=dtype, **kwargs)
+ super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+ def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
+ # init input tensor
+ clip_input = jax.random.normal(rng, input_shape)
+
+ params_rng, dropout_rng = jax.random.split(rng)
+ rngs = {"params": params_rng, "dropout": dropout_rng}
+
+ random_params = self.module.init(rngs, clip_input)["params"]
+
+ return random_params
+
+ def __call__(
+ self,
+ clip_input,
+ params: dict = None,
+ ):
+ clip_input = jnp.transpose(clip_input, (0, 2, 3, 1))
+
+ return self.module.apply(
+ {"params": params or self.params},
+ jnp.array(clip_input, dtype=jnp.float32),
+ rngs={},
+ )
diff --git a/src/model/TextGen/diffusers/pipelines/stochastic_karras_ve/__init__.py b/src/model/TextGen/diffusers/pipelines/stochastic_karras_ve/__init__.py
new file mode 100644
index 0000000..db25820
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/stochastic_karras_ve/__init__.py
@@ -0,0 +1,2 @@
+# flake8: noqa
+from .pipeline_stochastic_karras_ve import KarrasVePipeline
diff --git a/src/model/TextGen/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py b/src/model/TextGen/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py
new file mode 100644
index 0000000..739de8e
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py
@@ -0,0 +1,129 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple, Union
+
+import torch
+
+from ...models import UNet2DModel
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...schedulers import KarrasVeScheduler
+
+
+class KarrasVePipeline(DiffusionPipeline):
+ r"""
+ Stochastic sampling from Karras et al. [1] tailored to the Variance-Exploding (VE) models [2]. Use Algorithm 2 and
+ the VE column of Table 1 from [1] for reference.
+
+ [1] Karras, Tero, et al. "Elucidating the Design Space of Diffusion-Based Generative Models."
+ https://arxiv.org/abs/2206.00364
+ [2] Song, Yang, et al. "Score-based generative modeling through stochastic differential equations."
+ https://arxiv.org/abs/2011.13456
+
+ Parameters:
+ unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
+ scheduler ([`KarrasVeScheduler`]):
+ Scheduler for the diffusion process to be used in combination with `unet` to denoise the encoded image.
+ """
+
+ # add type hints for linting
+ unet: UNet2DModel
+ scheduler: KarrasVeScheduler
+
+ def __init__(self, unet: UNet2DModel, scheduler: KarrasVeScheduler):
+ super().__init__()
+ self.register_modules(unet=unet, scheduler=scheduler)
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ batch_size: int = 1,
+ num_inference_steps: int = 50,
+ generator: Optional[torch.Generator] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ **kwargs,
+ ) -> Union[Tuple, ImagePipelineOutput]:
+ r"""
+ Args:
+ batch_size (`int`, *optional*, defaults to 1):
+ The number of images to generate.
+ generator (`torch.Generator`, *optional*):
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+ deterministic.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+
+ Returns:
+ [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipeline_utils.ImagePipelineOutput`] if
+ `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+ generated images.
+ """
+
+ img_size = self.unet.config.sample_size
+ shape = (batch_size, 3, img_size, img_size)
+
+ model = self.unet
+
+ # sample x_0 ~ N(0, sigma_0^2 * I)
+ sample = torch.randn(*shape) * self.scheduler.init_noise_sigma
+ sample = sample.to(self.device)
+
+ self.scheduler.set_timesteps(num_inference_steps)
+
+ for t in self.progress_bar(self.scheduler.timesteps):
+ # here sigma_t == t_i from the paper
+ sigma = self.scheduler.schedule[t]
+ sigma_prev = self.scheduler.schedule[t - 1] if t > 0 else 0
+
+ # 1. Select temporarily increased noise level sigma_hat
+ # 2. Add new noise to move from sample_i to sample_hat
+ sample_hat, sigma_hat = self.scheduler.add_noise_to_input(sample, sigma, generator=generator)
+
+ # 3. Predict the noise residual given the noise magnitude `sigma_hat`
+ # The model inputs and output are adjusted by following eq. (213) in [1].
+ model_output = (sigma_hat / 2) * model((sample_hat + 1) / 2, sigma_hat / 2).sample
+
+ # 4. Evaluate dx/dt at sigma_hat
+ # 5. Take Euler step from sigma to sigma_prev
+ step_output = self.scheduler.step(model_output, sigma_hat, sigma_prev, sample_hat)
+
+ if sigma_prev != 0:
+ # 6. Apply 2nd order correction
+ # The model inputs and output are adjusted by following eq. (213) in [1].
+ model_output = (sigma_prev / 2) * model((step_output.prev_sample + 1) / 2, sigma_prev / 2).sample
+ step_output = self.scheduler.step_correct(
+ model_output,
+ sigma_hat,
+ sigma_prev,
+ sample_hat,
+ step_output.prev_sample,
+ step_output["derivative"],
+ )
+ sample = step_output.prev_sample
+
+ sample = (sample / 2 + 0.5).clamp(0, 1)
+ image = sample.cpu().permute(0, 2, 3, 1).numpy()
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ if not return_dict:
+ return (image,)
+
+ return ImagePipelineOutput(images=image)
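+
+
+# Usage sketch with a hypothetical checkpoint id; any UNet2DModel saved together
+# with a KarrasVeScheduler config would work:
+#
+# pipe = KarrasVePipeline.from_pretrained("some-org/karras-ve-checkpoint")
+# image = pipe(batch_size=1, num_inference_steps=50).images[0]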
diff --git a/src/model/TextGen/diffusers/pipelines/vq_diffusion/__init__.py b/src/model/TextGen/diffusers/pipelines/vq_diffusion/__init__.py
new file mode 100644
index 0000000..edf6f57
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/vq_diffusion/__init__.py
@@ -0,0 +1 @@
+from .pipeline_vq_diffusion import VQDiffusionPipeline
diff --git a/src/model/TextGen/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py b/src/model/TextGen/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py
new file mode 100644
index 0000000..d30b616
--- /dev/null
+++ b/src/model/TextGen/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py
@@ -0,0 +1,265 @@
+# Copyright 2022 Microsoft and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, List, Optional, Tuple, Union
+
+import torch
+
+from ... import Transformer2DModel, VQModel
+from ...schedulers.scheduling_vq_diffusion import VQDiffusionScheduler
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+class VQDiffusionPipeline(DiffusionPipeline):
+ r"""
+ Pipeline for text-to-image generation using VQ Diffusion
+
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+ Args:
+ vqvae ([`VQModel`]):
+ Vector Quantized Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent
+ representations.
+ text_encoder ([`CLIPTextModel`]):
+ Frozen text-encoder. VQ Diffusion uses the text portion of
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+ the [clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) variant.
+ tokenizer (`CLIPTokenizer`):
+ Tokenizer of class
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+ transformer ([`Transformer2DModel`]):
+ Conditional transformer to denoise the encoded image latents.
+ scheduler ([`VQDiffusionScheduler`]):
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+ """
+
+ vqvae: VQModel
+ text_encoder: CLIPTextModel
+ tokenizer: CLIPTokenizer
+ transformer: Transformer2DModel
+ scheduler: VQDiffusionScheduler
+
+ def __init__(
+ self,
+ vqvae: VQModel,
+ text_encoder: CLIPTextModel,
+ tokenizer: CLIPTokenizer,
+ transformer: Transformer2DModel,
+ scheduler: VQDiffusionScheduler,
+ ):
+ super().__init__()
+
+ self.register_modules(
+ vqvae=vqvae,
+ transformer=transformer,
+ text_encoder=text_encoder,
+ tokenizer=tokenizer,
+ scheduler=scheduler,
+ )
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ prompt: Union[str, List[str]],
+ num_inference_steps: int = 100,
+ truncation_rate: float = 1.0,
+ num_images_per_prompt: int = 1,
+ generator: Optional[torch.Generator] = None,
+ latents: Optional[torch.FloatTensor] = None,
+ output_type: Optional[str] = "pil",
+ return_dict: bool = True,
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+ callback_steps: Optional[int] = 1,
+ ) -> Union[ImagePipelineOutput, Tuple]:
+ """
+ Function invoked when calling the pipeline for generation.
+
+ Args:
+ prompt (`str` or `List[str]`):
+ The prompt or prompts to guide the image generation.
+ num_inference_steps (`int`, *optional*, defaults to 100):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ truncation_rate (`float`, *optional*, defaults to 1.0 (equivalent to no truncation)):
+ Used to "truncate" the predicted classes for x_0 such that the cumulative probability for a pixel is at
+ most `truncation_rate`. The lowest probabilities that would increase the cumulative probability above
+ `truncation_rate` are set to zero.
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ generator (`torch.Generator`, *optional*):
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+ deterministic.
+ latents (`torch.FloatTensor` of shape (batch), *optional*):
+ Pre-generated noisy latents to be used as inputs for image generation. Must be valid embedding indices.
+ Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will
+ be generated of completely masked latent pixels.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+ callback (`Callable`, *optional*):
+ A function that will be called every `callback_steps` steps during inference. The function will be
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+ callback_steps (`int`, *optional*, defaults to 1):
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
+ called at every step.
+
+ Returns:
+ [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipeline_utils.ImagePipelineOutput`] if
+ `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+ generated images.
+ """
+ if isinstance(prompt, str):
+ batch_size = 1
+ elif isinstance(prompt, list):
+ batch_size = len(prompt)
+ else:
+ raise ValueError(
+ f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+ batch_size = batch_size * num_images_per_prompt
+
+ if (callback_steps is None) or (
+ callback_steps is not None and (not isinstance(
+ callback_steps, int) or callback_steps <= 0)
+ ):
+ raise ValueError(
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+ f" {type(callback_steps)}."
+ )
+
+ # get prompt text embeddings
+ text_inputs = self.tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=self.tokenizer.model_max_length,
+ return_tensors="pt",
+ )
+ text_input_ids = text_inputs.input_ids
+
+ if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+ removed_text = self.tokenizer.batch_decode(
+ text_input_ids[:, self.tokenizer.model_max_length:])
+ logger.warning(
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ )
+ text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+ text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+
+ # NOTE: This additional step of normalizing the text embeddings is from VQ-Diffusion.
+ # While CLIP does normalize the pooled output of the text transformer when combining
+ # the image and text embeddings, CLIP does not directly normalize the last hidden state.
+ #
+ # CLIP normalizing the pooled output.
+ # https://github.com/huggingface/transformers/blob/d92e22d1f28324f513f3080e5c47c071a3916721/src/transformers/models/clip/modeling_clip.py#L1052-L1053
+ text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
+
+ # duplicate text embeddings for each generation per prompt
+ text_embeddings = text_embeddings.repeat_interleave(
+ num_images_per_prompt, dim=0)
+
+ # get the initial completely masked latents unless the user supplied it
+
+ latents_shape = (batch_size, self.transformer.num_latent_pixels)
+ if latents is None:
+ mask_class = self.transformer.num_vector_embeds - 1
+ latents = torch.full(latents_shape, mask_class).to(self.device)
+ else:
+ if latents.shape != latents_shape:
+ raise ValueError(
+ f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+ if (latents < 0).any() or (latents >= self.transformer.num_vector_embeds).any():
+ raise ValueError(
+ "Unexpected latents value(s). All latents be valid embedding indices i.e. in the range 0,"
+ f" {self.transformer.num_vector_embeds - 1} (inclusive)."
+ )
+ latents = latents.to(self.device)
+
+ # set timesteps
+ self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+
+ timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+ sample = latents
+
+ for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+ # predict the un-noised image
+ # model_output == `log_p_x_0`
+ model_output = self.transformer(
+ sample, encoder_hidden_states=text_embeddings, timestep=t).sample
+
+ model_output = self.truncate(model_output, truncation_rate)
+
+ # remove `log(0)`'s (`-inf`s)
+ model_output = model_output.clamp(-70)
+
+ # compute the previous noisy sample x_t -> x_t-1
+ sample = self.scheduler.step(
+ model_output, timestep=t, sample=sample, generator=generator).prev_sample
+
+ # call the callback, if provided
+ if callback is not None and i % callback_steps == 0:
+ callback(i, t, sample)
+
+ embedding_channels = self.vqvae.config.vq_embed_dim
+ embeddings_shape = (batch_size, self.transformer.height,
+ self.transformer.width, embedding_channels)
+ embeddings = self.vqvae.quantize.get_codebook_entry(
+ sample, shape=embeddings_shape)
+ image = self.vqvae.decode(embeddings, force_not_quantize=True).sample
+
+ image = (image / 2 + 0.5).clamp(0, 1)
+ image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+ if output_type == "pil":
+ image = self.numpy_to_pil(image)
+
+ if not return_dict:
+ return (image,)
+
+ return ImagePipelineOutput(images=image)
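+
+ # Usage sketch (e.g. the public "microsoft/vq-diffusion-ithq" checkpoint):
+ #
+ # pipe = VQDiffusionPipeline.from_pretrained("microsoft/vq-diffusion-ithq")
+ # image = pipe("teddy bear playing in the pool", truncation_rate=0.86).images[0]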
+
+ def truncate(self, log_p_x_0: torch.FloatTensor, truncation_rate: float) -> torch.FloatTensor:
+ """
+ Truncates `log_p_x_0` such that, for each column vector, the total cumulative probability is at most `truncation_rate`.
+ The lowest probabilities that would increase the cumulative probability above `truncation_rate` are set to zero.
+ """
+ sorted_log_p_x_0, indices = torch.sort(log_p_x_0, 1, descending=True)
+ sorted_p_x_0 = torch.exp(sorted_log_p_x_0)
+ keep_mask = sorted_p_x_0.cumsum(dim=1) < truncation_rate
+
+ # Ensure that at least the largest probability is not zeroed out
+ all_true = torch.full_like(keep_mask[:, 0:1, :], True)
+ keep_mask = torch.cat((all_true, keep_mask), dim=1)
+ keep_mask = keep_mask[:, :-1, :]
+
+ keep_mask = keep_mask.gather(1, indices.argsort(1))
+
+ rv = log_p_x_0.clone()
+
+ rv[~keep_mask] = -torch.inf # -inf = log(0)
+
+ return rv
diff --git a/src/model/TextGen/diffusers/schedulers/README.md b/src/model/TextGen/diffusers/schedulers/README.md
new file mode 100644
index 0000000..9494e35
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/README.md
@@ -0,0 +1,3 @@
+# Schedulers
+
+For more information on the schedulers, please refer to the [docs](https://huggingface.co/docs/diffusers/api/schedulers).
\ No newline at end of file
diff --git a/src/model/TextGen/diffusers/schedulers/__init__.py b/src/model/TextGen/diffusers/schedulers/__init__.py
new file mode 100644
index 0000000..6217bfc
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/__init__.py
@@ -0,0 +1,52 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ..utils import is_flax_available, is_scipy_available, is_torch_available
+
+
+if is_torch_available():
+ from .scheduling_ddim import DDIMScheduler
+ from .scheduling_ddpm import DDPMScheduler
+ from .scheduling_dpmsolver_multistep import DPMSolverMultistepScheduler
+ from .scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler
+ from .scheduling_euler_discrete import EulerDiscreteScheduler
+ from .scheduling_ipndm import IPNDMScheduler
+ from .scheduling_karras_ve import KarrasVeScheduler
+ from .scheduling_pndm import PNDMScheduler
+ from .scheduling_repaint import RePaintScheduler
+ from .scheduling_sde_ve import ScoreSdeVeScheduler
+ from .scheduling_sde_vp import ScoreSdeVpScheduler
+ from .scheduling_utils import SchedulerMixin
+ from .scheduling_vq_diffusion import VQDiffusionScheduler
+else:
+ from ..utils.dummy_pt_objects import * # noqa F403
+
+if is_flax_available():
+ from .scheduling_ddim_flax import FlaxDDIMScheduler
+ from .scheduling_ddpm_flax import FlaxDDPMScheduler
+ from .scheduling_dpmsolver_multistep_flax import FlaxDPMSolverMultistepScheduler
+ from .scheduling_karras_ve_flax import FlaxKarrasVeScheduler
+ from .scheduling_lms_discrete_flax import FlaxLMSDiscreteScheduler
+ from .scheduling_pndm_flax import FlaxPNDMScheduler
+ from .scheduling_sde_ve_flax import FlaxScoreSdeVeScheduler
+ from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
+else:
+ from ..utils.dummy_flax_objects import * # noqa F403
+
+
+if is_scipy_available() and is_torch_available():
+ from .scheduling_lms_discrete import LMSDiscreteScheduler
+else:
+ from ..utils.dummy_torch_and_scipy_objects import * # noqa F403
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_ddim.py b/src/model/TextGen/diffusers/schedulers/scheduling_ddim.py
new file mode 100644
index 0000000..484d258
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_ddim.py
@@ -0,0 +1,352 @@
+# Copyright 2022 Stanford University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
+# and https://github.com/hojonathanho/diffusion
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from .scheduling_utils import SchedulerMixin
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
+class DDIMSchedulerOutput(BaseOutput):
+ """
+ Output class for the scheduler's step function output.
+
+ Args:
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+ Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+ denoising loop.
+ pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+ The predicted denoised sample (x_{0}) based on the model output from the current timestep.
+ `pred_original_sample` can be used to preview progress or for guidance.
+ """
+
+ prev_sample: torch.FloatTensor
+ pred_original_sample: Optional[torch.FloatTensor] = None
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor:
+ """
+ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+ (1-beta) over time from t = [0,1].
+
+ Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+ to that part of the diffusion process.
+
+
+ Args:
+ num_diffusion_timesteps (`int`): the number of betas to produce.
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to
+ prevent singularities.
+
+ Returns:
+ betas (`torch.Tensor`): the betas used by the scheduler to step the model outputs
+ """
+
+ def alpha_bar(time_step):
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+ betas = []
+ for i in range(num_diffusion_timesteps):
+ t1 = i / num_diffusion_timesteps
+ t2 = (i + 1) / num_diffusion_timesteps
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+ return torch.tensor(betas)
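+
+
+# Numeric sketch: betas_for_alpha_bar(3) evaluates alpha_bar at t = 0, 1/3, 2/3, 1
+# and returns the three betas 1 - alpha_bar(t2) / alpha_bar(t1), each capped at
+# max_beta = 0.999.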
+
+
+class DDIMScheduler(SchedulerMixin, ConfigMixin):
+ """
+ Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising
+ diffusion probabilistic models (DDPMs) with non-Markovian guidance.
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ For more details, see the original paper: https://arxiv.org/abs/2010.02502
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ beta_start (`float`): the starting `beta` value of inference.
+ beta_end (`float`): the final `beta` value.
+ beta_schedule (`str`):
+ the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+ `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+ trained_betas (`np.ndarray`, optional):
+ option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+ clip_sample (`bool`, default `True`):
+ option to clip predicted sample between -1 and 1 for numerical stability.
+ set_alpha_to_one (`bool`, default `True`):
+ each diffusion step uses the value of alphas product at that step and at the previous one. For the final
+ step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
+ otherwise it uses the value of alpha at step 0.
+ steps_offset (`int`, default `0`):
+ an offset added to the inference steps. You can use a combination of `offset=1` and
+ `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in
+ stable diffusion.
+
+ """
+
+ _compatible_classes = [
+ "PNDMScheduler",
+ "DDPMScheduler",
+ "LMSDiscreteScheduler",
+ "EulerDiscreteScheduler",
+ "EulerAncestralDiscreteScheduler",
+ "DPMSolverMultistepScheduler",
+ ]
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[np.ndarray] = None,
+ clip_sample: bool = True,
+ set_alpha_to_one: bool = True,
+ steps_offset: int = 0,
+ ):
+ if trained_betas is not None:
+ self.betas = torch.from_numpy(trained_betas)
+ elif beta_schedule == "linear":
+ self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+ elif beta_schedule == "scaled_linear":
+ # this schedule is very specific to the latent diffusion model.
+ self.betas = (
+ torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+ )
+ elif beta_schedule == "squaredcos_cap_v2":
+ # Glide cosine schedule
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
+ else:
+ raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
+
+ self.alphas = 1.0 - self.betas
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+ # At every step in ddim, we are looking into the previous alphas_cumprod
+ # For the final step, there is no previous alphas_cumprod because we are already at 0
+ # `set_alpha_to_one` decides whether we set this parameter simply to one or
+ # whether we use the final alpha of the "non-previous" one.
+ self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
+
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = 1.0
+
+ # setable values
+ self.num_inference_steps = None
+ self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
+
+ def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
+ """
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+ current timestep.
+
+ Args:
+ sample (`torch.FloatTensor`): input sample
+ timestep (`int`, optional): current timestep
+
+ Returns:
+ `torch.FloatTensor`: scaled input sample
+ """
+ return sample
+
+ def _get_variance(self, timestep, prev_timestep):
+
+ alpha_prod_t = self.alphas_cumprod[timestep]
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+ beta_prod_t = 1 - alpha_prod_t
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+ variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+ # With bfloat16 precision, the first two entries of the alphas_cumprod table round up to 1,
+ # which can make this ratio negative or infinite; guard by clamping negative variance to zero.
+ if variance >= 0:
+ return variance
+ else:
+ return torch.zeros_like(variance)
+
+ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None, **kwargs):
+ """
+ Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ """
+ self.num_inference_steps = num_inference_steps
+ step_ratio = self.config.num_train_timesteps // self.num_inference_steps
+ # creates integer timesteps by multiplying by ratio
+ # casting to int to avoid issues when num_inference_step is power of 3
+ timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
+ last_part_ratio = kwargs.get("last_part_ratio", 1.0)
+ timesteps = timesteps[-int(len(timesteps) * last_part_ratio):]
+ self.timesteps = torch.from_numpy(timesteps).to(device)
+ self.timesteps += self.config.steps_offset
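+ # Worked example: num_train_timesteps=1000 with num_inference_steps=4 gives
+ # step_ratio=250 and timesteps [750, 500, 250, 0] (before adding steps_offset);
+ # passing last_part_ratio=0.5 keeps only the final half, [250, 0].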
+
+ def step(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: int,
+ sample: torch.FloatTensor,
+ eta: float = 0.0,
+ use_clipped_model_output: bool = False,
+ generator=None,
+ variance_noise: Optional[torch.FloatTensor] = None,
+ return_dict: bool = True,
+ ) -> Union[DDIMSchedulerOutput, Tuple]:
+ """
+ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+ eta (`float`): weight of noise for added noise in diffusion step.
+ use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped
+ predicted original sample. Necessary because predicted original sample is clipped to [-1, 1] when
+ `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would
+ coincide with the one provided as input and `use_clipped_model_output` will have no effect.
+ generator: random number generator.
+ variance_noise (`torch.FloatTensor`): instead of generating noise for the variance using `generator`, we
+ can directly provide the noise for the variance itself. This is useful for methods such as
+ CycleDiffusion. (https://arxiv.org/abs/2210.05559)
+ return_dict (`bool`): option for returning tuple rather than DDIMSchedulerOutput class
+
+ Returns:
+ [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] or `tuple`:
+ [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
+ returning a tuple, the first element is the sample tensor.
+
+ """
+ if self.num_inference_steps is None:
+ raise ValueError(
+ "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+ )
+
+ # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
+ # Ideally, read the DDIM paper in detail for a full understanding
+
+ # Notation (<variable name> -> <name in paper>)
+ # - pred_noise_t -> e_theta(x_t, t)
+ # - pred_original_sample -> f_theta(x_t, t) or x_0
+ # - std_dev_t -> sigma_t
+ # - eta -> η
+ # - pred_sample_direction -> "direction pointing to x_t"
+ # - pred_prev_sample -> "x_t-1"
+
+ # 1. get previous step value (=t-1)
+ prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
+
+ # 2. compute alphas, betas
+ alpha_prod_t = self.alphas_cumprod[timestep]
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+
+ beta_prod_t = 1 - alpha_prod_t
+
+ # 3. compute predicted original sample from predicted noise also called
+ # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+ pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+
+ # 4. Clip "predicted x_0"
+ if self.config.clip_sample:
+ pred_original_sample = torch.clamp(pred_original_sample, -1, 1)
+
+ # 5. compute variance: "sigma_t(η)" -> see formula (16)
+ # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
+ variance = self._get_variance(timestep, prev_timestep)
+ std_dev_t = eta * variance ** (0.5)
+
+ if use_clipped_model_output:
+ # the model_output is always re-derived from the clipped x_0 in Glide
+ model_output = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
+
+ # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+ pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * model_output
+
+ # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+ prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
+
+
+ if eta > 0:
+ # randn_like does not support generator https://github.com/pytorch/pytorch/issues/27072
+ device = model_output.device
+ if variance_noise is not None and generator is not None:
+ raise ValueError(
+ "Cannot pass both generator and variance_noise. Please make sure that either `generator` or"
+ " `variance_noise` stays `None`."
+ )
+
+ if variance_noise is None:
+ if device.type == "mps":
+ # randn does not work reproducibly on mps
+ variance_noise = torch.randn(model_output.shape, dtype=model_output.dtype, generator=generator)
+ variance_noise = variance_noise.to(device)
+ else:
+ variance_noise = torch.randn(
+ model_output.shape, generator=generator, device=device, dtype=model_output.dtype
+ )
+ variance = self._get_variance(timestep, prev_timestep) ** (0.5) * eta * variance_noise
+
+ prev_sample = prev_sample + variance
+
+ if not return_dict:
+ return (prev_sample,)
+
+ return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+
+ def add_noise(
+ self,
+ original_samples: torch.FloatTensor,
+ noise: torch.FloatTensor,
+ timesteps: torch.IntTensor,
+ ) -> torch.FloatTensor:
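+ # Forward diffusion q(x_t | x_0): x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise
+ # (Eq. (4) in the DDPM paper, https://arxiv.org/abs/2006.11239).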
+ # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+ self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
+ timesteps = timesteps.to(original_samples.device)
+
+ sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+ while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+ sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+ return noisy_samples
+
+ def __len__(self):
+ return self.config.num_train_timesteps
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_ddim_flax.py b/src/model/TextGen/diffusers/schedulers/scheduling_ddim_flax.py
new file mode 100644
index 0000000..590e3aa
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_ddim_flax.py
@@ -0,0 +1,291 @@
+# Copyright 2022 Stanford University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
+# and https://github.com/hojonathanho/diffusion
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import flax
+import jax.numpy as jnp
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> jnp.ndarray:
+ """
+ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+ (1-beta) over time from t = [0,1].
+
+ Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+ to that part of the diffusion process.
+
+
+ Args:
+ num_diffusion_timesteps (`int`): the number of betas to produce.
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to
+ prevent singularities.
+
+ Returns:
+ betas (`jnp.ndarray`): the betas used by the scheduler to step the model outputs
+ """
+
+ def alpha_bar(time_step):
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+ betas = []
+ for i in range(num_diffusion_timesteps):
+ t1 = i / num_diffusion_timesteps
+ t2 = (i + 1) / num_diffusion_timesteps
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+ return jnp.array(betas, dtype=jnp.float32)
+
+
+@flax.struct.dataclass
+class DDIMSchedulerState:
+ # setable values
+ timesteps: jnp.ndarray
+ alphas_cumprod: jnp.ndarray
+ num_inference_steps: Optional[int] = None
+
+ @classmethod
+ def create(cls, num_train_timesteps: int, alphas_cumprod: jnp.ndarray):
+ return cls(timesteps=jnp.arange(0, num_train_timesteps)[::-1], alphas_cumprod=alphas_cumprod)
+
+
+@dataclass
+class FlaxDDIMSchedulerOutput(FlaxSchedulerOutput):
+ state: DDIMSchedulerState
+
+
+class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
+ """
+ Denoising Diffusion Implicit Models (DDIM) is a scheduler that extends the denoising procedure introduced in
+ denoising diffusion probabilistic models (DDPMs) with non-Markovian guidance.
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ For more details, see the original paper: https://arxiv.org/abs/2010.02502
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ beta_start (`float`): the starting `beta` value of inference.
+ beta_end (`float`): the final `beta` value.
+ beta_schedule (`str`):
+ the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+ `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+ trained_betas (`jnp.ndarray`, optional):
+ option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+ clip_sample (`bool`, default `True`):
+ option to clip predicted sample between -1 and 1 for numerical stability.
+ set_alpha_to_one (`bool`, default `True`):
+ each diffusion step uses the value of alphas product at that step and at the previous one. For the final
+ step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
+ otherwise it uses the value of alpha at step 0.
+ steps_offset (`int`, default `0`):
+ an offset added to the inference steps. You can use a combination of `offset=1` and
+ `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in
+ stable diffusion.
+ """
+
+ @property
+ def has_state(self):
+ return True
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ set_alpha_to_one: bool = True,
+ steps_offset: int = 0,
+ ):
+ if beta_schedule == "linear":
+ self.betas = jnp.linspace(beta_start, beta_end, num_train_timesteps, dtype=jnp.float32)
+ elif beta_schedule == "scaled_linear":
+ # this schedule is very specific to the latent diffusion model.
+ self.betas = jnp.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=jnp.float32) ** 2
+ elif beta_schedule == "squaredcos_cap_v2":
+ # Glide cosine schedule
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
+ else:
+ raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+ self.alphas = 1.0 - self.betas
+
+ # HACK for now - clean up later (PVP)
+ self._alphas_cumprod = jnp.cumprod(self.alphas, axis=0)
+
+ # At every step in ddim, we are looking into the previous alphas_cumprod
+ # For the final step, there is no previous alphas_cumprod because we are already at 0
+ # `set_alpha_to_one` decides whether we set this parameter simply to one or
+ # whether we use the final alpha of the "non-previous" one.
+ self.final_alpha_cumprod = jnp.array(1.0) if set_alpha_to_one else float(self._alphas_cumprod[0])
+
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = 1.0
+
+ def scale_model_input(
+ self, state: DDIMSchedulerState, sample: jnp.ndarray, timestep: Optional[int] = None
+ ) -> jnp.ndarray:
+ """
+ Args:
+ state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance.
+ sample (`jnp.ndarray`): input sample
+ timestep (`int`, optional): current timestep
+
+ Returns:
+ `jnp.ndarray`: scaled input sample
+ """
+ return sample
+
+ def create_state(self):
+ return DDIMSchedulerState.create(
+ num_train_timesteps=self.config.num_train_timesteps, alphas_cumprod=self._alphas_cumprod
+ )
+
+ def _get_variance(self, timestep, prev_timestep, alphas_cumprod):
+ alpha_prod_t = alphas_cumprod[timestep]
+ alpha_prod_t_prev = jnp.where(prev_timestep >= 0, alphas_cumprod[prev_timestep], self.final_alpha_cumprod)
+ beta_prod_t = 1 - alpha_prod_t
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+ variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+
+ return variance
+
+ def set_timesteps(
+ self, state: DDIMSchedulerState, num_inference_steps: int, shape: Tuple = ()
+ ) -> DDIMSchedulerState:
+ """
+ Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ state (`DDIMSchedulerState`):
+ the `FlaxDDIMScheduler` state data class instance.
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ """
+ offset = self.config.steps_offset
+
+ step_ratio = self.config.num_train_timesteps // num_inference_steps
+ # creates integer timesteps by multiplying by ratio
+ # casting to int to avoid issues when num_inference_steps is a power of 3
+ timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round()[::-1]
+ timesteps = timesteps + offset
+
+ return state.replace(num_inference_steps=num_inference_steps, timesteps=timesteps)
+
+ def step(
+ self,
+ state: DDIMSchedulerState,
+ model_output: jnp.ndarray,
+ timestep: int,
+ sample: jnp.ndarray,
+ return_dict: bool = True,
+ ) -> Union[FlaxDDIMSchedulerOutput, Tuple]:
+ """
+ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ Args:
+ state (`DDIMSchedulerState`): the `FlaxDDIMScheduler` state data class instance.
+ model_output (`jnp.ndarray`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`jnp.ndarray`):
+ current instance of sample being created by diffusion process.
+ return_dict (`bool`): option for returning tuple rather than FlaxDDIMSchedulerOutput class
+
+ Returns:
+ [`FlaxDDIMSchedulerOutput`] or `tuple`: [`FlaxDDIMSchedulerOutput`] if `return_dict` is True, otherwise a
+ `tuple`. When returning a tuple, the first element is the sample tensor.
+
+ """
+ if state.num_inference_steps is None:
+ raise ValueError(
+ "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+ )
+
+ # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
+ # Ideally, read the DDIM paper in detail for a full understanding
+
+ # Notation (<variable name> -> <name in paper>)
+ # - pred_noise_t -> e_theta(x_t, t)
+ # - pred_original_sample -> f_theta(x_t, t) or x_0
+ # - std_dev_t -> sigma_t
+ # - eta -> η
+ # - pred_sample_direction -> "direction pointing to x_t"
+ # - pred_prev_sample -> "x_t-1"
+
+ # TODO(Patrick) - eta is always 0.0 for now, allow to be set in step function
+ eta = 0.0
+
+ # 1. get previous step value (=t-1)
+ prev_timestep = timestep - self.config.num_train_timesteps // state.num_inference_steps
+
+ alphas_cumprod = state.alphas_cumprod
+
+ # 2. compute alphas, betas
+ alpha_prod_t = alphas_cumprod[timestep]
+ alpha_prod_t_prev = jnp.where(prev_timestep >= 0, alphas_cumprod[prev_timestep], self.final_alpha_cumprod)
+
+ beta_prod_t = 1 - alpha_prod_t
+
+ # 3. compute predicted original sample from predicted noise also called
+ # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+ pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+
+ # 4. compute variance: "sigma_t(η)" -> see formula (16)
+ # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
+ variance = self._get_variance(timestep, prev_timestep, alphas_cumprod)
+ std_dev_t = eta * variance ** (0.5)
+
+ # 5. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+ pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * model_output
+
+ # 6. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+ prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
+
+ if not return_dict:
+ return (prev_sample, state)
+
+ return FlaxDDIMSchedulerOutput(prev_sample=prev_sample, state=state)
+
+ def add_noise(
+ self,
+ original_samples: jnp.ndarray,
+ noise: jnp.ndarray,
+ timesteps: jnp.ndarray,
+ ) -> jnp.ndarray:
+ sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+ sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape)
+
+ sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+ sqrt_one_minus_alpha_prod = broadcast_to_shape_from_left(sqrt_one_minus_alpha_prod, original_samples.shape)
+
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+ return noisy_samples
+
+ def __len__(self):
+ return self.config.num_train_timesteps
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_ddpm.py b/src/model/TextGen/diffusers/schedulers/scheduling_ddpm.py
new file mode 100644
index 0000000..a19d918
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_ddpm.py
@@ -0,0 +1,337 @@
+# Copyright 2022 UC Berkeley Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, FrozenDict, register_to_config
+from ..utils import BaseOutput, deprecate
+from .scheduling_utils import SchedulerMixin
+
+
+@dataclass
+class DDPMSchedulerOutput(BaseOutput):
+ """
+ Output class for the scheduler's step function output.
+
+ Args:
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+ Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+ denoising loop.
+ pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+ The predicted denoised sample (x_{0}) based on the model output from the current timestep.
+ `pred_original_sample` can be used to preview progress or for guidance.
+ """
+
+ prev_sample: torch.FloatTensor
+ pred_original_sample: Optional[torch.FloatTensor] = None
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
+ """
+ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+ (1-beta) over time from t = [0,1].
+
+ Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+ to that part of the diffusion process.
+
+
+ Args:
+ num_diffusion_timesteps (`int`): the number of betas to produce.
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to
+ prevent singularities.
+
+ Returns:
+ betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+ """
+
+ def alpha_bar(time_step):
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+ betas = []
+ for i in range(num_diffusion_timesteps):
+ t1 = i / num_diffusion_timesteps
+ t2 = (i + 1) / num_diffusion_timesteps
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+ return torch.tensor(betas, dtype=torch.float32)
+
+
+class DDPMScheduler(SchedulerMixin, ConfigMixin):
+ """
+ Denoising diffusion probabilistic models (DDPMs) explore the connections between denoising score matching and
+ Langevin dynamics sampling.
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ For more details, see the original paper: https://arxiv.org/abs/2006.11239
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ beta_start (`float`): the starting `beta` value of inference.
+ beta_end (`float`): the final `beta` value.
+ beta_schedule (`str`):
+ the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+ `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+ trained_betas (`np.ndarray`, optional):
+ option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+ variance_type (`str`):
+ options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small`,
+ `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
+ clip_sample (`bool`, default `True`):
+ option to clip predicted sample between -1 and 1 for numerical stability.
+ predict_epsilon (`bool`):
+ optional flag indicating whether the model predicts the noise (epsilon) or the samples directly.
+
+ """
+
+ _compatible_classes = [
+ "DDIMScheduler",
+ "PNDMScheduler",
+ "LMSDiscreteScheduler",
+ "EulerDiscreteScheduler",
+ "EulerAncestralDiscreteScheduler",
+ "DPMSolverMultistepScheduler",
+ ]
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[np.ndarray] = None,
+ variance_type: str = "fixed_small",
+ clip_sample: bool = True,
+ predict_epsilon: bool = True,
+ ):
+ if trained_betas is not None:
+ self.betas = torch.from_numpy(trained_betas)
+ elif beta_schedule == "linear":
+ self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+ elif beta_schedule == "scaled_linear":
+ # this schedule is very specific to the latent diffusion model.
+ self.betas = (
+ torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+ )
+ elif beta_schedule == "squaredcos_cap_v2":
+ # Glide cosine schedule
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
+ elif beta_schedule == "sigmoid":
+ # GeoDiff sigmoid schedule
+ betas = torch.linspace(-6, 6, num_train_timesteps)
+ self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start
+ else:
+ raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+ self.alphas = 1.0 - self.betas
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+ self.one = torch.tensor(1.0)
+
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = 1.0
+
+ # setable values
+ self.num_inference_steps = None
+ self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy())
+
+ self.variance_type = variance_type
+
+ def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
+ """
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+ current timestep.
+
+ Args:
+ sample (`torch.FloatTensor`): input sample
+ timestep (`int`, optional): current timestep
+
+ Returns:
+ `torch.FloatTensor`: scaled input sample
+ """
+ return sample
+
+ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+ """
+ Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ """
+ num_inference_steps = min(self.config.num_train_timesteps, num_inference_steps)
+ self.num_inference_steps = num_inference_steps
+ timesteps = np.arange(
+ 0, self.config.num_train_timesteps, self.config.num_train_timesteps // self.num_inference_steps
+ )[::-1].copy()
+ self.timesteps = torch.from_numpy(timesteps).to(device)
+
+ def _get_variance(self, t, predicted_variance=None, variance_type=None):
+ alpha_prod_t = self.alphas_cumprod[t]
+ alpha_prod_t_prev = self.alphas_cumprod[t - 1] if t > 0 else self.one
+
+ # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
+ # and sample from it to get previous sample
+ # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
+ variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[t]
+
+ if variance_type is None:
+ variance_type = self.config.variance_type
+
+ # hacks - were probably added for training stability
+ if variance_type == "fixed_small":
+ variance = torch.clamp(variance, min=1e-20)
+ # for rl-diffuser https://arxiv.org/abs/2205.09991
+ elif variance_type == "fixed_small_log":
+ variance = torch.log(torch.clamp(variance, min=1e-20))
+ elif variance_type == "fixed_large":
+ variance = self.betas[t]
+ elif variance_type == "fixed_large_log":
+ # Glide max_log
+ variance = torch.log(self.betas[t])
+ elif variance_type == "learned":
+ return predicted_variance
+ elif variance_type == "learned_range":
+ min_log = variance
+ max_log = self.betas[t]
+ frac = (predicted_variance + 1) / 2
+ variance = frac * max_log + (1 - frac) * min_log
+
+ return variance
+
+ def step(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: int,
+ sample: torch.FloatTensor,
+ generator=None,
+ return_dict: bool = True,
+ **kwargs,
+ ) -> Union[DDPMSchedulerOutput, Tuple]:
+ """
+ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+ generator: random number generator.
+ return_dict (`bool`): option for returning tuple rather than DDPMSchedulerOutput class
+
+ Returns:
+ [`~schedulers.scheduling_utils.DDPMSchedulerOutput`] or `tuple`:
+ [`~schedulers.scheduling_utils.DDPMSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
+ returning a tuple, the first element is the sample tensor.
+
+ """
+ message = (
+ "Please make sure to instantiate your scheduler with `predict_epsilon` instead. E.g. `scheduler ="
+ " DDPMScheduler.from_config(, predict_epsilon=True)`."
+ )
+ predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
+ if predict_epsilon is not None and predict_epsilon != self.config.predict_epsilon:
+ new_config = dict(self.config)
+ new_config["predict_epsilon"] = predict_epsilon
+ self._internal_dict = FrozenDict(new_config)
+
+ t = timestep
+
+ if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
+ model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)
+ else:
+ predicted_variance = None
+
+ # 1. compute alphas, betas
+ alpha_prod_t = self.alphas_cumprod[t]
+ alpha_prod_t_prev = self.alphas_cumprod[t - 1] if t > 0 else self.one
+ beta_prod_t = 1 - alpha_prod_t
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+ # 2. compute predicted original sample from predicted noise also called
+ # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
+ if self.config.predict_epsilon:
+ pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+ else:
+ pred_original_sample = model_output
+
+ # 3. Clip "predicted x_0"
+ if self.config.clip_sample:
+ pred_original_sample = torch.clamp(pred_original_sample, -1, 1)
+
+ # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
+ # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+ pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * self.betas[t]) / beta_prod_t
+ current_sample_coeff = self.alphas[t] ** (0.5) * beta_prod_t_prev / beta_prod_t
+
+ # 5. Compute predicted previous sample µ_t
+ # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+ pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample
+
+ # 6. Add noise
+ variance = 0
+ if t > 0:
+ device = model_output.device
+ if device.type == "mps":
+ # randn does not work reproducibly on mps
+ variance_noise = torch.randn(model_output.shape, dtype=model_output.dtype, generator=generator)
+ variance_noise = variance_noise.to(device)
+ else:
+ variance_noise = torch.randn(
+ model_output.shape, generator=generator, device=device, dtype=model_output.dtype
+ )
+ variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * variance_noise
+
+ pred_prev_sample = pred_prev_sample + variance
+
+ if not return_dict:
+ return (pred_prev_sample,)
+
+ return DDPMSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample)
+
+ def add_noise(
+ self,
+ original_samples: torch.FloatTensor,
+ noise: torch.FloatTensor,
+ timesteps: torch.IntTensor,
+ ) -> torch.FloatTensor:
+ # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+ self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
+ timesteps = timesteps.to(original_samples.device)
+
+ sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+ while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+ sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+ return noisy_samples
+
+ def __len__(self):
+ return self.config.num_train_timesteps
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_ddpm_flax.py b/src/model/TextGen/diffusers/schedulers/scheduling_ddpm_flax.py
new file mode 100644
index 0000000..f1b04a0
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_ddpm_flax.py
@@ -0,0 +1,297 @@
+# Copyright 2022 UC Berkeley Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import flax
+import jax.numpy as jnp
+from jax import random
+
+from ..configuration_utils import ConfigMixin, FrozenDict, register_to_config
+from ..utils import deprecate
+from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> jnp.ndarray:
+ """
+ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+ (1-beta) over time from t = [0,1].
+
+ Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+ to that part of the diffusion process.
+
+
+ Args:
+ num_diffusion_timesteps (`int`): the number of betas to produce.
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to
+ prevent singularities.
+
+ Returns:
+ betas (`jnp.ndarray`): the betas used by the scheduler to step the model outputs
+ """
+
+ def alpha_bar(time_step):
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+ betas = []
+ for i in range(num_diffusion_timesteps):
+ t1 = i / num_diffusion_timesteps
+ t2 = (i + 1) / num_diffusion_timesteps
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+ return jnp.array(betas, dtype=jnp.float32)
+
+
+@flax.struct.dataclass
+class DDPMSchedulerState:
+ # setable values
+ timesteps: jnp.ndarray
+ num_inference_steps: Optional[int] = None
+
+ @classmethod
+ def create(cls, num_train_timesteps: int):
+ return cls(timesteps=jnp.arange(0, num_train_timesteps)[::-1])
+
+
+@dataclass
+class FlaxDDPMSchedulerOutput(FlaxSchedulerOutput):
+ state: DDPMSchedulerState
+
+
+class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
+ """
+ Denoising diffusion probabilistic models (DDPMs) explore the connections between denoising score matching and
+ Langevin dynamics sampling.
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ For more details, see the original paper: https://arxiv.org/abs/2006.11239
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ beta_start (`float`): the starting `beta` value of inference.
+ beta_end (`float`): the final `beta` value.
+ beta_schedule (`str`):
+ the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+ `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+ trained_betas (`np.ndarray`, optional):
+ option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+ variance_type (`str`):
+ options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small`,
+ `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
+ clip_sample (`bool`, default `True`):
+ option to clip predicted sample between -1 and 1 for numerical stability.
+ predict_epsilon (`bool`):
+ optional flag indicating whether the model predicts the noise (epsilon) or the samples directly.
+
+ """
+
+ @property
+ def has_state(self):
+ return True
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[jnp.ndarray] = None,
+ variance_type: str = "fixed_small",
+ clip_sample: bool = True,
+ predict_epsilon: bool = True,
+ ):
+ if trained_betas is not None:
+ self.betas = jnp.asarray(trained_betas)
+ elif beta_schedule == "linear":
+ self.betas = jnp.linspace(beta_start, beta_end, num_train_timesteps, dtype=jnp.float32)
+ elif beta_schedule == "scaled_linear":
+ # this schedule is very specific to the latent diffusion model.
+ self.betas = jnp.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=jnp.float32) ** 2
+ elif beta_schedule == "squaredcos_cap_v2":
+ # Glide cosine schedule
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
+ else:
+ raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+ self.alphas = 1.0 - self.betas
+ self.alphas_cumprod = jnp.cumprod(self.alphas, axis=0)
+ self.one = jnp.array(1.0)
+
+ def create_state(self):
+ return DDPMSchedulerState.create(num_train_timesteps=self.config.num_train_timesteps)
+
+ def set_timesteps(
+ self, state: DDPMSchedulerState, num_inference_steps: int, shape: Tuple = ()
+ ) -> DDPMSchedulerState:
+ """
+ Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ state (`DDPMSchedulerState`):
+ the `FlaxDDPMScheduler` state data class instance.
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ """
+ num_inference_steps = min(self.config.num_train_timesteps, num_inference_steps)
+ timesteps = jnp.arange(
+ 0, self.config.num_train_timesteps, self.config.num_train_timesteps // num_inference_steps
+ )[::-1]
+ return state.replace(num_inference_steps=num_inference_steps, timesteps=timesteps)
+
+ def _get_variance(self, t, predicted_variance=None, variance_type=None):
+ alpha_prod_t = self.alphas_cumprod[t]
+ alpha_prod_t_prev = self.alphas_cumprod[t - 1] if t > 0 else self.one
+
+ # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
+ # and sample from it to get previous sample
+ # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
+ variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[t]
+
+ if variance_type is None:
+ variance_type = self.config.variance_type
+
+ # hacks - were probably added for training stability
+ if variance_type == "fixed_small":
+ variance = jnp.clip(variance, a_min=1e-20)
+ # for rl-diffuser https://arxiv.org/abs/2205.09991
+ elif variance_type == "fixed_small_log":
+ variance = jnp.log(jnp.clip(variance, a_min=1e-20))
+ elif variance_type == "fixed_large":
+ variance = self.betas[t]
+ elif variance_type == "fixed_large_log":
+ # Glide max_log
+ variance = jnp.log(self.betas[t])
+ elif variance_type == "learned":
+ return predicted_variance
+ elif variance_type == "learned_range":
+ min_log = variance
+ max_log = self.betas[t]
+ frac = (predicted_variance + 1) / 2
+ variance = frac * max_log + (1 - frac) * min_log
+
+ return variance
+
+ def step(
+ self,
+ state: DDPMSchedulerState,
+ model_output: jnp.ndarray,
+ timestep: int,
+ sample: jnp.ndarray,
+ key: random.KeyArray,
+ predict_epsilon: bool = True,
+ return_dict: bool = True,
+ **kwargs,
+ ) -> Union[FlaxDDPMSchedulerOutput, Tuple]:
+ """
+ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ Args:
+ state (`DDPMSchedulerState`): the `FlaxDDPMScheduler` state data class instance.
+ model_output (`jnp.ndarray`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`jnp.ndarray`):
+ current instance of sample being created by diffusion process.
+ key (`random.KeyArray`): a PRNG key.
+ return_dict (`bool`): option for returning tuple rather than FlaxDDPMSchedulerOutput class
+
+ Returns:
+ [`FlaxDDPMSchedulerOutput`] or `tuple`: [`FlaxDDPMSchedulerOutput`] if `return_dict` is True, otherwise a
+ `tuple`. When returning a tuple, the first element is the sample tensor.
+
+ """
+ message = (
+ "Please make sure to instantiate your scheduler with `predict_epsilon` instead. E.g. `scheduler ="
+ " DDPMScheduler.from_config(, predict_epsilon=True)`."
+ )
+ predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
+ if predict_epsilon is not None and predict_epsilon != self.config.predict_epsilon:
+ new_config = dict(self.config)
+ new_config["predict_epsilon"] = predict_epsilon
+ self._internal_dict = FrozenDict(new_config)
+
+ t = timestep
+
+ if model_output.shape[1] == sample.shape[1] * 2 and self.config.variance_type in ["learned", "learned_range"]:
+ # split off the predicted variance channels (jnp.split takes split indices, not a chunk size)
+ model_output, predicted_variance = jnp.split(model_output, [sample.shape[1]], axis=1)
+ else:
+ predicted_variance = None
+
+ # 1. compute alphas, betas
+ alpha_prod_t = self.alphas_cumprod[t]
+ alpha_prod_t_prev = self.alphas_cumprod[t - 1] if t > 0 else self.one
+ beta_prod_t = 1 - alpha_prod_t
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+ # 2. compute predicted original sample from predicted noise also called
+ # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
+ if self.config.predict_epsilon:
+ pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+ else:
+ pred_original_sample = model_output
+
+ # 3. Clip "predicted x_0"
+ if self.config.clip_sample:
+ pred_original_sample = jnp.clip(pred_original_sample, -1, 1)
+
+ # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
+ # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+ pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * self.betas[t]) / beta_prod_t
+ current_sample_coeff = self.alphas[t] ** (0.5) * beta_prod_t_prev / beta_prod_t
+
+ # 5. Compute predicted previous sample µ_t
+ # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+ pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample
+
+ # 6. Add noise
+ variance = 0
+ if t > 0:
+ key = random.split(key, num=1)[0]
+ noise = random.normal(key=key, shape=model_output.shape)
+ variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * noise
+
+ pred_prev_sample = pred_prev_sample + variance
+
+ if not return_dict:
+ return (pred_prev_sample, state)
+
+ return FlaxDDPMSchedulerOutput(prev_sample=pred_prev_sample, state=state)
+
+ def add_noise(
+ self,
+ original_samples: jnp.ndarray,
+ noise: jnp.ndarray,
+ timesteps: jnp.ndarray,
+ ) -> jnp.ndarray:
+ sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+ sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape)
+
+ sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+ sqrt_one_minus_alpha_prod = broadcast_to_shape_from_left(sqrt_one_minus_alpha_prod, original_samples.shape)
+
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+ return noisy_samples
+
+ def __len__(self):
+ return self.config.num_train_timesteps
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/model/TextGen/diffusers/schedulers/scheduling_dpmsolver_multistep.py
new file mode 100644
index 0000000..d166354
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_dpmsolver_multistep.py
@@ -0,0 +1,506 @@
+# Copyright 2022 TSAIL Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils import SchedulerMixin, SchedulerOutput
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
+ """
+ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+ (1-beta) over time from t = [0,1].
+
+ Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+ to that part of the diffusion process.
+
+
+ Args:
+ num_diffusion_timesteps (`int`): the number of betas to produce.
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to
+ prevent singularities.
+
+ Returns:
+ betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+ """
+
+ def alpha_bar(time_step):
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+ betas = []
+ for i in range(num_diffusion_timesteps):
+ t1 = i / num_diffusion_timesteps
+ t2 = (i + 1) / num_diffusion_timesteps
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+ return torch.tensor(betas, dtype=torch.float32)
+
+
+class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
+ """
+ DPM-Solver (and the improved version DPM-Solver++) is a fast dedicated high-order solver for diffusion ODEs with
+ the convergence order guarantee. Empirically, sampling by DPM-Solver with only 20 steps can generate high-quality
+ samples, and it can generate quite good samples even in only 10 steps.
+
+ For more details, see the original paper: https://arxiv.org/abs/2206.00927 and https://arxiv.org/abs/2211.01095
+
+ Currently, we support the multistep DPM-Solver for both noise prediction models and data prediction models. We
+ recommend using `solver_order=2` for guided sampling, and `solver_order=3` for unconditional sampling.
+
+ We also support the "dynamic thresholding" method in Imagen (https://arxiv.org/abs/2205.11487). For pixel-space
+ diffusion models, you can set both `algorithm_type="dpmsolver++"` and `thresholding=True` to use the dynamic
+ thresholding. Note that the thresholding method is unsuitable for latent-space diffusion models (such as
+ stable-diffusion).
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ beta_start (`float`): the starting `beta` value of inference.
+ beta_end (`float`): the final `beta` value.
+ beta_schedule (`str`):
+ the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+ `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+ trained_betas (`np.ndarray`, optional):
+ option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+ solver_order (`int`, default `2`):
+ the order of DPM-Solver; can be `1`, `2`, or `3`. We recommend using `solver_order=2` for guided
+ sampling, and `solver_order=3` for unconditional sampling.
+ predict_epsilon (`bool`, default `True`):
+ we currently support both the noise prediction model and the data prediction model. If the model predicts
+ the noise / epsilon, set `predict_epsilon` to `True`. If the model predicts the data / x0 directly, set
+ `predict_epsilon` to `False`.
+ thresholding (`bool`, default `False`):
+ whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487).
+ For pixel-space diffusion models, you can set both `algorithm_type=dpmsolver++` and `thresholding=True` to
+ use the dynamic thresholding. Note that the thresholding method is unsuitable for latent-space diffusion
+ models (such as stable-diffusion).
+ dynamic_thresholding_ratio (`float`, default `0.995`):
+ the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen
+ (https://arxiv.org/abs/2205.11487).
+ sample_max_value (`float`, default `1.0`):
+ the threshold value for dynamic thresholding. Valid only when `thresholding=True` and
+ `algorithm_type="dpmsolver++`.
+ algorithm_type (`str`, default `dpmsolver++`):
+ the algorithm type for the solver. Either `dpmsolver` or `dpmsolver++`. The `dpmsolver` type implements the
+ algorithms in https://arxiv.org/abs/2206.00927, and the `dpmsolver++` type implements the algorithms in
+ https://arxiv.org/abs/2211.01095. We recommend using `dpmsolver++` with `solver_order=2` for guided
+ sampling (e.g. stable-diffusion).
+ solver_type (`str`, default `midpoint`):
+ the solver type for the second-order solver. Either `midpoint` or `heun`. The solver type slightly affects
+ the sample quality, especially for a small number of steps. We empirically find that `midpoint` solvers are
+ slightly better, so we recommend using the `midpoint` type.
+ lower_order_final (`bool`, default `True`):
+ whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. We empirically
+ find this trick can stabilize the sampling of DPM-Solver for steps < 15, especially for steps <= 10.
+
+ """
+
+ _compatible_classes = [
+ "DDIMScheduler",
+ "DDPMScheduler",
+ "PNDMScheduler",
+ "LMSDiscreteScheduler",
+ "EulerDiscreteScheduler",
+ "EulerAncestralDiscreteScheduler",
+ ]
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[np.ndarray] = None,
+ solver_order: int = 2,
+ predict_epsilon: bool = True,
+ thresholding: bool = False,
+ dynamic_thresholding_ratio: float = 0.995,
+ sample_max_value: float = 1.0,
+ algorithm_type: str = "dpmsolver++",
+ solver_type: str = "midpoint",
+ lower_order_final: bool = True,
+ ):
+ if trained_betas is not None:
+ self.betas = torch.from_numpy(trained_betas)
+ elif beta_schedule == "linear":
+ self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+ elif beta_schedule == "scaled_linear":
+ # this schedule is very specific to the latent diffusion model.
+ self.betas = (
+ torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+ )
+ elif beta_schedule == "squaredcos_cap_v2":
+ # Glide cosine schedule
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
+ else:
+ raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+ self.alphas = 1.0 - self.betas
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+ # Currently we only support VP-type noise schedule
+ self.alpha_t = torch.sqrt(self.alphas_cumprod)
+ self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
+ self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
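+ # lambda_t = log(alpha_t / sigma_t) is one half of the log signal-to-noise ratio; DPM-Solver
+ # formulates its update rules as integrals in this lambda domain.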
+
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = 1.0
+
+ # settings for DPM-Solver
+ if algorithm_type not in ["dpmsolver", "dpmsolver++"]:
+ raise NotImplementedError(f"{algorithm_type} is not implemented for {self.__class__}")
+ if solver_type not in ["midpoint", "heun"]:
+ raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}")
+
+ # setable values
+ self.num_inference_steps = None
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
+ self.timesteps = torch.from_numpy(timesteps)
+ self.model_outputs = [None] * solver_order
+ self.lower_order_nums = 0
+
+ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+ """
+ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ device (`str` or `torch.device`, optional):
+ the device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+ """
+ self.num_inference_steps = num_inference_steps
+ timesteps = (
+ np.linspace(0, self.num_train_timesteps - 1, num_inference_steps + 1)
+ .round()[::-1][:-1]
+ .copy()
+ .astype(np.int64)
+ )
+ self.timesteps = torch.from_numpy(timesteps).to(device)
+ self.model_outputs = [
+ None,
+ ] * self.config.solver_order
+ self.lower_order_nums = 0
+
+ def convert_model_output(
+ self, model_output: torch.FloatTensor, timestep: int, sample: torch.FloatTensor
+ ) -> torch.FloatTensor:
+ """
+ Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs.
+
+ DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to
+ discretize an integral of the data prediction model. So we need to first convert the model output to the
+ corresponding type to match the algorithm.
+
+ Note that the algorithm type and the model type are decoupled. That is to say, we can use either DPM-Solver or
+ DPM-Solver++ for both noise prediction model and data prediction model.
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+
+ Returns:
+ `torch.FloatTensor`: the converted model output.
+ """
+ # DPM-Solver++ needs to solve an integral of the data prediction model.
+ if self.config.algorithm_type == "dpmsolver++":
+ if self.config.predict_epsilon:
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+ x0_pred = (sample - sigma_t * model_output) / alpha_t
+ else:
+ x0_pred = model_output
+ if self.config.thresholding:
+ # Dynamic thresholding in https://arxiv.org/abs/2205.11487
+ dynamic_max_val = torch.quantile(
+ torch.abs(x0_pred).reshape((x0_pred.shape[0], -1)), self.config.dynamic_thresholding_ratio, dim=1
+ )
+ dynamic_max_val = torch.maximum(
+ dynamic_max_val,
+ self.config.sample_max_value * torch.ones_like(dynamic_max_val).to(dynamic_max_val.device),
+ )[(...,) + (None,) * (x0_pred.ndim - 1)]
+ x0_pred = torch.clamp(x0_pred, -dynamic_max_val, dynamic_max_val) / dynamic_max_val
+ return x0_pred
+ # DPM-Solver needs to solve an integral of the noise prediction model.
+ elif self.config.algorithm_type == "dpmsolver":
+ if self.config.predict_epsilon:
+ return model_output
+ else:
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+ epsilon = (sample - alpha_t * model_output) / sigma_t
+ return epsilon
+
+ def dpm_solver_first_order_update(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: int,
+ prev_timestep: int,
+ sample: torch.FloatTensor,
+ ) -> torch.FloatTensor:
+ """
+ One step for the first-order DPM-Solver (equivalent to DDIM).
+
+ See https://arxiv.org/abs/2206.00927 for the detailed derivation.
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ prev_timestep (`int`): previous discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+
+ Returns:
+ `torch.FloatTensor`: the sample tensor at the previous timestep.
+ """
+ lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep]
+ alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep]
+ sigma_t, sigma_s = self.sigma_t[prev_timestep], self.sigma_t[timestep]
+ h = lambda_t - lambda_s
+ if self.config.algorithm_type == "dpmsolver++":
+ x_t = (sigma_t / sigma_s) * sample - (alpha_t * (torch.exp(-h) - 1.0)) * model_output
+ elif self.config.algorithm_type == "dpmsolver":
+ x_t = (alpha_t / alpha_s) * sample - (sigma_t * (torch.exp(h) - 1.0)) * model_output
+ return x_t
+
+ def multistep_dpm_solver_second_order_update(
+ self,
+ model_output_list: List[torch.FloatTensor],
+ timestep_list: List[int],
+ prev_timestep: int,
+ sample: torch.FloatTensor,
+ ) -> torch.FloatTensor:
+ """
+ One step for the second-order multistep DPM-Solver.
+
+ Args:
+ model_output_list (`List[torch.FloatTensor]`):
+ direct outputs from learned diffusion model at current and latter timesteps.
+ timestep_list (`List[int]`): current and latter discrete timesteps in the diffusion chain.
+ prev_timestep (`int`): previous discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+
+ Returns:
+ `torch.FloatTensor`: the sample tensor at the previous timestep.
+ """
+ t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2]
+ m0, m1 = model_output_list[-1], model_output_list[-2]
+ lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1]
+ alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
+ sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+ h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
+ r0 = h_0 / h
+ D0, D1 = m0, (1.0 / r0) * (m0 - m1)
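+ # D0 is the newest model output; D1 is a scaled finite difference approximating its
+ # derivative with respect to lambda, built from the two cached outputs.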
+ if self.config.algorithm_type == "dpmsolver++":
+ # See https://arxiv.org/abs/2211.01095 for detailed derivations
+ if self.config.solver_type == "midpoint":
+ x_t = (
+ (sigma_t / sigma_s0) * sample
+ - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+ - 0.5 * (alpha_t * (torch.exp(-h) - 1.0)) * D1
+ )
+ elif self.config.solver_type == "heun":
+ x_t = (
+ (sigma_t / sigma_s0) * sample
+ - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+ + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
+ )
+ elif self.config.algorithm_type == "dpmsolver":
+ # See https://arxiv.org/abs/2206.00927 for detailed derivations
+ if self.config.solver_type == "midpoint":
+ x_t = (
+ (alpha_t / alpha_s0) * sample
+ - (sigma_t * (torch.exp(h) - 1.0)) * D0
+ - 0.5 * (sigma_t * (torch.exp(h) - 1.0)) * D1
+ )
+ elif self.config.solver_type == "heun":
+ x_t = (
+ (alpha_t / alpha_s0) * sample
+ - (sigma_t * (torch.exp(h) - 1.0)) * D0
+ - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
+ )
+ return x_t
+
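+    # Note on the second-order update: h is the step size in lambda (half log-SNR)
+    # space, D0 = m0 is the zeroth-order term, and D1 = (m0 - m1) / r0 is a
+    # first-order finite difference of the cached model outputs. The "midpoint"
+    # rule weights D1 with 0.5 * (exp(-h) - 1) (in the dpmsolver++ form), while
+    # "heun" uses the exact integral weight ((exp(-h) - 1) / h + 1).
+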
+ def multistep_dpm_solver_third_order_update(
+ self,
+ model_output_list: List[torch.FloatTensor],
+ timestep_list: List[int],
+ prev_timestep: int,
+ sample: torch.FloatTensor,
+ ) -> torch.FloatTensor:
+ """
+ One step for the third-order multistep DPM-Solver.
+
+ Args:
+ model_output_list (`List[torch.FloatTensor]`):
+ direct outputs from learned diffusion model at current and latter timesteps.
+            timestep_list (`List[int]`): current and latter discrete timesteps in the diffusion chain.
+ prev_timestep (`int`): previous discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+
+ Returns:
+ `torch.FloatTensor`: the sample tensor at the previous timestep.
+ """
+ t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[-2], timestep_list[-3]
+ m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
+ lambda_t, lambda_s0, lambda_s1, lambda_s2 = (
+ self.lambda_t[t],
+ self.lambda_t[s0],
+ self.lambda_t[s1],
+ self.lambda_t[s2],
+ )
+ alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
+ sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+ h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
+ r0, r1 = h_0 / h, h_1 / h
+ D0 = m0
+ D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)
+ D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
+ D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
+ if self.config.algorithm_type == "dpmsolver++":
+ # See https://arxiv.org/abs/2206.00927 for detailed derivations
+ x_t = (
+ (sigma_t / sigma_s0) * sample
+ - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+ + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
+ - (alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2
+ )
+ elif self.config.algorithm_type == "dpmsolver":
+ # See https://arxiv.org/abs/2206.00927 for detailed derivations
+ x_t = (
+ (alpha_t / alpha_s0) * sample
+ - (sigma_t * (torch.exp(h) - 1.0)) * D0
+ - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
+ - (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2
+ )
+ return x_t
+
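+    # The third-order update extends the same idea: D1 and D2 are first- and
+    # second-order finite differences of the cached model outputs in lambda space,
+    # matching a third-order Taylor expansion of the exact solution.
+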
+ def step(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: int,
+ sample: torch.FloatTensor,
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
+ """
+ Step function propagating the sample with the multistep DPM-Solver.
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+ return_dict (`bool`): option for returning tuple rather than SchedulerOutput class
+
+ Returns:
+ [`~scheduling_utils.SchedulerOutput`] or `tuple`: [`~scheduling_utils.SchedulerOutput`] if `return_dict` is
+ True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+
+ """
+ if self.num_inference_steps is None:
+ raise ValueError(
+ "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+ )
+
+ if isinstance(timestep, torch.Tensor):
+ timestep = timestep.to(self.timesteps.device)
+ step_index = (self.timesteps == timestep).nonzero()
+ if len(step_index) == 0:
+ step_index = len(self.timesteps) - 1
+ else:
+ step_index = step_index.item()
+ prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1]
+ lower_order_final = (
+ (step_index == len(self.timesteps) - 1) and self.config.lower_order_final and len(self.timesteps) < 15
+ )
+ lower_order_second = (
+ (step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15
+ )
+
+ model_output = self.convert_model_output(model_output, timestep, sample)
+ for i in range(self.config.solver_order - 1):
+ self.model_outputs[i] = self.model_outputs[i + 1]
+ self.model_outputs[-1] = model_output
+
+ if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
+ prev_sample = self.dpm_solver_first_order_update(model_output, timestep, prev_timestep, sample)
+ elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
+ timestep_list = [self.timesteps[step_index - 1], timestep]
+ prev_sample = self.multistep_dpm_solver_second_order_update(
+ self.model_outputs, timestep_list, prev_timestep, sample
+ )
+ else:
+ timestep_list = [self.timesteps[step_index - 2], self.timesteps[step_index - 1], timestep]
+ prev_sample = self.multistep_dpm_solver_third_order_update(
+ self.model_outputs, timestep_list, prev_timestep, sample
+ )
+
+ if self.lower_order_nums < self.config.solver_order:
+ self.lower_order_nums += 1
+
+ if not return_dict:
+ return (prev_sample,)
+
+ return SchedulerOutput(prev_sample=prev_sample)
+
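+    # A minimal usage sketch (hypothetical `unet` and `latents`, for illustration only):
+    #
+    #   scheduler = DPMSolverMultistepScheduler()
+    #   scheduler.set_timesteps(20)
+    #   for t in scheduler.timesteps:
+    #       model_output = unet(latents, t).sample
+    #       latents = scheduler.step(model_output, t, latents).prev_sample
+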
+ def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
+ """
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+ current timestep.
+
+ Args:
+ sample (`torch.FloatTensor`): input sample
+
+ Returns:
+ `torch.FloatTensor`: scaled input sample
+ """
+ return sample
+
+ def add_noise(
+ self,
+ original_samples: torch.FloatTensor,
+ noise: torch.FloatTensor,
+ timesteps: torch.IntTensor,
+ ) -> torch.FloatTensor:
+ # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+ self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
+ timesteps = timesteps.to(original_samples.device)
+
+ sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+ while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+ sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+ return noisy_samples
+
+ def __len__(self):
+ return self.config.num_train_timesteps
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py b/src/model/TextGen/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
new file mode 100644
index 0000000..c9a6d1c
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
@@ -0,0 +1,590 @@
+# Copyright 2022 TSAIL Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import flax
+import jax
+import jax.numpy as jnp
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps: int, max_beta=0.999) -> jnp.ndarray:
+ """
+ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+ (1-beta) over time from t = [0,1].
+
+ Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+ to that part of the diffusion process.
+
+
+ Args:
+ num_diffusion_timesteps (`int`): the number of betas to produce.
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to
+ prevent singularities.
+
+ Returns:
+ betas (`jnp.ndarray`): the betas used by the scheduler to step the model outputs
+ """
+
+ def alpha_bar(time_step):
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+ betas = []
+ for i in range(num_diffusion_timesteps):
+ t1 = i / num_diffusion_timesteps
+ t2 = (i + 1) / num_diffusion_timesteps
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+ return jnp.array(betas, dtype=jnp.float32)
+
+
+@flax.struct.dataclass
+class DPMSolverMultistepSchedulerState:
+ # setable values
+ num_inference_steps: Optional[int] = None
+ timesteps: Optional[jnp.ndarray] = None
+
+ # running values
+ model_outputs: Optional[jnp.ndarray] = None
+ lower_order_nums: Optional[int] = None
+ step_index: Optional[int] = None
+ prev_timestep: Optional[int] = None
+ cur_sample: Optional[jnp.ndarray] = None
+
+ @classmethod
+ def create(cls, num_train_timesteps: int):
+ return cls(timesteps=jnp.arange(0, num_train_timesteps)[::-1])
+
+
+@dataclass
+class FlaxDPMSolverMultistepSchedulerOutput(FlaxSchedulerOutput):
+ state: DPMSolverMultistepSchedulerState
+
+
+class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
+ """
+    DPM-Solver (and the improved version DPM-Solver++) is a fast, dedicated high-order solver for diffusion ODEs with
+    a convergence order guarantee. Empirically, DPM-Solver can generate high-quality samples in only 20 steps, and
+    quite good samples in as few as 10 steps.
+
+ For more details, see the original paper: https://arxiv.org/abs/2206.00927 and https://arxiv.org/abs/2211.01095
+
+    Currently, we support the multistep DPM-Solver for both noise prediction models and data prediction models. We
+    recommend using `solver_order=2` for guided sampling and `solver_order=3` for unconditional sampling.
+
+ We also support the "dynamic thresholding" method in Imagen (https://arxiv.org/abs/2205.11487). For pixel-space
+ diffusion models, you can set both `algorithm_type="dpmsolver++"` and `thresholding=True` to use the dynamic
+ thresholding. Note that the thresholding method is unsuitable for latent-space diffusion models (such as
+ stable-diffusion).
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ beta_start (`float`): the starting `beta` value of inference.
+ beta_end (`float`): the final `beta` value.
+ beta_schedule (`str`):
+ the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+ `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+ trained_betas (`np.ndarray`, optional):
+ option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+ solver_order (`int`, default `2`):
+            the order of DPM-Solver; can be `1`, `2`, or `3`. We recommend using `solver_order=2` for guided
+            sampling and `solver_order=3` for unconditional sampling.
+ predict_epsilon (`bool`, default `True`):
+ we currently support both the noise prediction model and the data prediction model. If the model predicts
+ the noise / epsilon, set `predict_epsilon` to `True`. If the model predicts the data / x0 directly, set
+ `predict_epsilon` to `False`.
+ thresholding (`bool`, default `False`):
+ whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487).
+ For pixel-space diffusion models, you can set both `algorithm_type=dpmsolver++` and `thresholding=True` to
+ use the dynamic thresholding. Note that the thresholding method is unsuitable for latent-space diffusion
+ models (such as stable-diffusion).
+ dynamic_thresholding_ratio (`float`, default `0.995`):
+ the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen
+ (https://arxiv.org/abs/2205.11487).
+ sample_max_value (`float`, default `1.0`):
+ the threshold value for dynamic thresholding. Valid only when `thresholding=True` and
+            `algorithm_type="dpmsolver++"`.
+ algorithm_type (`str`, default `dpmsolver++`):
+ the algorithm type for the solver. Either `dpmsolver` or `dpmsolver++`. The `dpmsolver` type implements the
+ algorithms in https://arxiv.org/abs/2206.00927, and the `dpmsolver++` type implements the algorithms in
+            https://arxiv.org/abs/2211.01095. We recommend using `dpmsolver++` with `solver_order=2` for guided
+ sampling (e.g. stable-diffusion).
+ solver_type (`str`, default `midpoint`):
+ the solver type for the second-order solver. Either `midpoint` or `heun`. The solver type slightly affects
+            the sample quality, especially for a small number of steps. We empirically find that `midpoint` solvers are
+            slightly better, so we recommend the `midpoint` type.
+ lower_order_final (`bool`, default `True`):
+ whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. We empirically
+ find this trick can stabilize the sampling of DPM-Solver for steps < 15, especially for steps <= 10.
+
+ """
+
+ @property
+ def has_state(self):
+ return True
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[jnp.ndarray] = None,
+ solver_order: int = 2,
+ predict_epsilon: bool = True,
+ thresholding: bool = False,
+ dynamic_thresholding_ratio: float = 0.995,
+ sample_max_value: float = 1.0,
+ algorithm_type: str = "dpmsolver++",
+ solver_type: str = "midpoint",
+ lower_order_final: bool = True,
+ ):
+ if trained_betas is not None:
+ self.betas = jnp.asarray(trained_betas)
+ elif beta_schedule == "linear":
+ self.betas = jnp.linspace(beta_start, beta_end, num_train_timesteps, dtype=jnp.float32)
+ elif beta_schedule == "scaled_linear":
+ # this schedule is very specific to the latent diffusion model.
+ self.betas = jnp.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=jnp.float32) ** 2
+ elif beta_schedule == "squaredcos_cap_v2":
+ # Glide cosine schedule
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
+ else:
+ raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
+
+ self.alphas = 1.0 - self.betas
+ self.alphas_cumprod = jnp.cumprod(self.alphas, axis=0)
+ # Currently we only support VP-type noise schedule
+ self.alpha_t = jnp.sqrt(self.alphas_cumprod)
+ self.sigma_t = jnp.sqrt(1 - self.alphas_cumprod)
+ self.lambda_t = jnp.log(self.alpha_t) - jnp.log(self.sigma_t)
+
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = 1.0
+
+ # settings for DPM-Solver
+ if algorithm_type not in ["dpmsolver", "dpmsolver++"]:
+ raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}")
+ if solver_type not in ["midpoint", "heun"]:
+ raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}")
+
+ def create_state(self):
+ return DPMSolverMultistepSchedulerState.create(num_train_timesteps=self.config.num_train_timesteps)
+
+ def set_timesteps(
+ self, state: DPMSolverMultistepSchedulerState, num_inference_steps: int, shape: Tuple
+ ) -> DPMSolverMultistepSchedulerState:
+ """
+ Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ state (`DPMSolverMultistepSchedulerState`):
+ the `FlaxDPMSolverMultistepScheduler` state data class instance.
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ shape (`Tuple`):
+ the shape of the samples to be generated.
+ """
+ timesteps = (
+ jnp.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1)
+ .round()[::-1][:-1]
+ .astype(jnp.int32)
+ )
+
+ return state.replace(
+ num_inference_steps=num_inference_steps,
+ timesteps=timesteps,
+ model_outputs=jnp.zeros((self.config.solver_order,) + shape),
+ lower_order_nums=0,
+ step_index=0,
+ prev_timestep=-1,
+ cur_sample=jnp.zeros(shape),
+ )
+
+ def convert_model_output(
+ self,
+ model_output: jnp.ndarray,
+ timestep: int,
+ sample: jnp.ndarray,
+ ) -> jnp.ndarray:
+ """
+ Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs.
+
+        DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to
+        discretize an integral of the data prediction model. So we need to first convert the model output to the
+        corresponding type to match the algorithm.
+
+        Note that the algorithm type and the model type are decoupled. That is to say, we can use either DPM-Solver or
+        DPM-Solver++ for both noise prediction models and data prediction models.
+
+ Args:
+ model_output (`jnp.ndarray`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`jnp.ndarray`):
+ current instance of sample being created by diffusion process.
+
+ Returns:
+ `jnp.ndarray`: the converted model output.
+ """
+ # DPM-Solver++ needs to solve an integral of the data prediction model.
+ if self.config.algorithm_type == "dpmsolver++":
+ if self.config.predict_epsilon:
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+ x0_pred = (sample - sigma_t * model_output) / alpha_t
+ else:
+ x0_pred = model_output
+ if self.config.thresholding:
+ # Dynamic thresholding in https://arxiv.org/abs/2205.11487
+                # jnp.percentile expects q in [0, 100], unlike torch.quantile's [0, 1]
+                dynamic_max_val = jnp.percentile(
+                    jnp.abs(x0_pred), self.config.dynamic_thresholding_ratio * 100, axis=tuple(range(1, x0_pred.ndim))
+                )
+ dynamic_max_val = jnp.maximum(
+ dynamic_max_val, self.config.sample_max_value * jnp.ones_like(dynamic_max_val)
+ )
+ x0_pred = jnp.clip(x0_pred, -dynamic_max_val, dynamic_max_val) / dynamic_max_val
+ return x0_pred
+ # DPM-Solver needs to solve an integral of the noise prediction model.
+ elif self.config.algorithm_type == "dpmsolver":
+ if self.config.predict_epsilon:
+ return model_output
+ else:
+ alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+ epsilon = (sample - alpha_t * model_output) / sigma_t
+ return epsilon
+
+ def dpm_solver_first_order_update(
+ self, model_output: jnp.ndarray, timestep: int, prev_timestep: int, sample: jnp.ndarray
+ ) -> jnp.ndarray:
+ """
+ One step for the first-order DPM-Solver (equivalent to DDIM).
+
+ See https://arxiv.org/abs/2206.00927 for the detailed derivation.
+
+ Args:
+ model_output (`jnp.ndarray`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ prev_timestep (`int`): previous discrete timestep in the diffusion chain.
+ sample (`jnp.ndarray`):
+ current instance of sample being created by diffusion process.
+
+ Returns:
+ `jnp.ndarray`: the sample tensor at the previous timestep.
+ """
+ t, s0 = prev_timestep, timestep
+ m0 = model_output
+ lambda_t, lambda_s = self.lambda_t[t], self.lambda_t[s0]
+ alpha_t, alpha_s = self.alpha_t[t], self.alpha_t[s0]
+ sigma_t, sigma_s = self.sigma_t[t], self.sigma_t[s0]
+ h = lambda_t - lambda_s
+ if self.config.algorithm_type == "dpmsolver++":
+ x_t = (sigma_t / sigma_s) * sample - (alpha_t * (jnp.exp(-h) - 1.0)) * m0
+ elif self.config.algorithm_type == "dpmsolver":
+ x_t = (alpha_t / alpha_s) * sample - (sigma_t * (jnp.exp(h) - 1.0)) * m0
+ return x_t
+
+ def multistep_dpm_solver_second_order_update(
+ self,
+ model_output_list: jnp.ndarray,
+ timestep_list: List[int],
+ prev_timestep: int,
+ sample: jnp.ndarray,
+ ) -> jnp.ndarray:
+ """
+ One step for the second-order multistep DPM-Solver.
+
+ Args:
+ model_output_list (`List[jnp.ndarray]`):
+ direct outputs from learned diffusion model at current and latter timesteps.
+            timestep_list (`List[int]`): current and latter discrete timesteps in the diffusion chain.
+ prev_timestep (`int`): previous discrete timestep in the diffusion chain.
+ sample (`jnp.ndarray`):
+ current instance of sample being created by diffusion process.
+
+ Returns:
+ `jnp.ndarray`: the sample tensor at the previous timestep.
+ """
+ t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2]
+ m0, m1 = model_output_list[-1], model_output_list[-2]
+ lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1]
+ alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
+ sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+ h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
+ r0 = h_0 / h
+ D0, D1 = m0, (1.0 / r0) * (m0 - m1)
+ if self.config.algorithm_type == "dpmsolver++":
+ # See https://arxiv.org/abs/2211.01095 for detailed derivations
+ if self.config.solver_type == "midpoint":
+ x_t = (
+ (sigma_t / sigma_s0) * sample
+ - (alpha_t * (jnp.exp(-h) - 1.0)) * D0
+ - 0.5 * (alpha_t * (jnp.exp(-h) - 1.0)) * D1
+ )
+ elif self.config.solver_type == "heun":
+ x_t = (
+ (sigma_t / sigma_s0) * sample
+ - (alpha_t * (jnp.exp(-h) - 1.0)) * D0
+ + (alpha_t * ((jnp.exp(-h) - 1.0) / h + 1.0)) * D1
+ )
+ elif self.config.algorithm_type == "dpmsolver":
+ # See https://arxiv.org/abs/2206.00927 for detailed derivations
+ if self.config.solver_type == "midpoint":
+ x_t = (
+ (alpha_t / alpha_s0) * sample
+ - (sigma_t * (jnp.exp(h) - 1.0)) * D0
+ - 0.5 * (sigma_t * (jnp.exp(h) - 1.0)) * D1
+ )
+ elif self.config.solver_type == "heun":
+ x_t = (
+ (alpha_t / alpha_s0) * sample
+ - (sigma_t * (jnp.exp(h) - 1.0)) * D0
+ - (sigma_t * ((jnp.exp(h) - 1.0) / h - 1.0)) * D1
+ )
+ return x_t
+
+ def multistep_dpm_solver_third_order_update(
+ self,
+ model_output_list: jnp.ndarray,
+ timestep_list: List[int],
+ prev_timestep: int,
+ sample: jnp.ndarray,
+ ) -> jnp.ndarray:
+ """
+ One step for the third-order multistep DPM-Solver.
+
+ Args:
+ model_output_list (`List[jnp.ndarray]`):
+ direct outputs from learned diffusion model at current and latter timesteps.
+            timestep_list (`List[int]`): current and latter discrete timesteps in the diffusion chain.
+ prev_timestep (`int`): previous discrete timestep in the diffusion chain.
+ sample (`jnp.ndarray`):
+ current instance of sample being created by diffusion process.
+
+ Returns:
+ `jnp.ndarray`: the sample tensor at the previous timestep.
+ """
+ t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[-2], timestep_list[-3]
+ m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
+ lambda_t, lambda_s0, lambda_s1, lambda_s2 = (
+ self.lambda_t[t],
+ self.lambda_t[s0],
+ self.lambda_t[s1],
+ self.lambda_t[s2],
+ )
+ alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
+ sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+ h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
+ r0, r1 = h_0 / h, h_1 / h
+ D0 = m0
+ D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)
+ D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
+ D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
+ if self.config.algorithm_type == "dpmsolver++":
+ # See https://arxiv.org/abs/2206.00927 for detailed derivations
+ x_t = (
+ (sigma_t / sigma_s0) * sample
+ - (alpha_t * (jnp.exp(-h) - 1.0)) * D0
+ + (alpha_t * ((jnp.exp(-h) - 1.0) / h + 1.0)) * D1
+ - (alpha_t * ((jnp.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2
+ )
+ elif self.config.algorithm_type == "dpmsolver":
+ # See https://arxiv.org/abs/2206.00927 for detailed derivations
+ x_t = (
+ (alpha_t / alpha_s0) * sample
+ - (sigma_t * (jnp.exp(h) - 1.0)) * D0
+ - (sigma_t * ((jnp.exp(h) - 1.0) / h - 1.0)) * D1
+ - (sigma_t * ((jnp.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2
+ )
+ return x_t
+
+ def step(
+ self,
+ state: DPMSolverMultistepSchedulerState,
+ model_output: jnp.ndarray,
+ timestep: int,
+ sample: jnp.ndarray,
+ return_dict: bool = True,
+ ) -> Union[FlaxDPMSolverMultistepSchedulerOutput, Tuple]:
+ """
+ Predict the sample at the previous timestep by DPM-Solver. Core function to propagate the diffusion process
+ from the learned model outputs (most often the predicted noise).
+
+ Args:
+ state (`DPMSolverMultistepSchedulerState`):
+ the `FlaxDPMSolverMultistepScheduler` state data class instance.
+ model_output (`jnp.ndarray`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`jnp.ndarray`):
+ current instance of sample being created by diffusion process.
+ return_dict (`bool`): option for returning tuple rather than FlaxDPMSolverMultistepSchedulerOutput class
+
+ Returns:
+ [`FlaxDPMSolverMultistepSchedulerOutput`] or `tuple`: [`FlaxDPMSolverMultistepSchedulerOutput`] if
+ `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+
+ """
+ prev_timestep = jax.lax.cond(
+ state.step_index == len(state.timesteps) - 1,
+ lambda _: 0,
+ lambda _: state.timesteps[state.step_index + 1],
+ (),
+ )
+
+ model_output = self.convert_model_output(model_output, timestep, sample)
+
+ model_outputs_new = jnp.roll(state.model_outputs, -1, axis=0)
+ model_outputs_new = model_outputs_new.at[-1].set(model_output)
+ state = state.replace(
+ model_outputs=model_outputs_new,
+ prev_timestep=prev_timestep,
+ cur_sample=sample,
+ )
+
+ def step_1(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray:
+ return self.dpm_solver_first_order_update(
+ state.model_outputs[-1],
+ state.timesteps[state.step_index],
+ state.prev_timestep,
+ state.cur_sample,
+ )
+
+ def step_23(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray:
+ def step_2(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray:
+ timestep_list = jnp.array([state.timesteps[state.step_index - 1], state.timesteps[state.step_index]])
+ return self.multistep_dpm_solver_second_order_update(
+ state.model_outputs,
+ timestep_list,
+ state.prev_timestep,
+ state.cur_sample,
+ )
+
+ def step_3(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray:
+ timestep_list = jnp.array(
+ [
+ state.timesteps[state.step_index - 2],
+ state.timesteps[state.step_index - 1],
+ state.timesteps[state.step_index],
+ ]
+ )
+ return self.multistep_dpm_solver_third_order_update(
+ state.model_outputs,
+ timestep_list,
+ state.prev_timestep,
+ state.cur_sample,
+ )
+
+ if self.config.solver_order == 2:
+ return step_2(state)
+ elif self.config.lower_order_final and len(state.timesteps) < 15:
+ return jax.lax.cond(
+ state.lower_order_nums < 2,
+ step_2,
+ lambda state: jax.lax.cond(
+ state.step_index == len(state.timesteps) - 2,
+ step_2,
+ step_3,
+ state,
+ ),
+ state,
+ )
+ else:
+ return jax.lax.cond(
+ state.lower_order_nums < 2,
+ step_2,
+ step_3,
+ state,
+ )
+
+ if self.config.solver_order == 1:
+ prev_sample = step_1(state)
+ elif self.config.lower_order_final and len(state.timesteps) < 15:
+ prev_sample = jax.lax.cond(
+ state.lower_order_nums < 1,
+ step_1,
+ lambda state: jax.lax.cond(
+ state.step_index == len(state.timesteps) - 1,
+ step_1,
+ step_23,
+ state,
+ ),
+ state,
+ )
+ else:
+ prev_sample = jax.lax.cond(
+ state.lower_order_nums < 1,
+ step_1,
+ step_23,
+ state,
+ )
+
+ state = state.replace(
+ lower_order_nums=jnp.minimum(state.lower_order_nums + 1, self.config.solver_order),
+ step_index=(state.step_index + 1),
+ )
+
+ if not return_dict:
+ return (prev_sample, state)
+
+ return FlaxDPMSolverMultistepSchedulerOutput(prev_sample=prev_sample, state=state)
+
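+    # A minimal usage sketch of the functional state pattern (hypothetical
+    # `unet_apply`, `params`, and `latents`, for illustration only):
+    #
+    #   scheduler = FlaxDPMSolverMultistepScheduler()
+    #   state = scheduler.create_state()
+    #   state = scheduler.set_timesteps(state, num_inference_steps=20, shape=latents.shape)
+    #   for t in state.timesteps:
+    #       model_output = unet_apply(params, latents, t)
+    #       latents, state = scheduler.step(state, model_output, t, latents, return_dict=False)
+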
+ def scale_model_input(
+ self, state: DPMSolverMultistepSchedulerState, sample: jnp.ndarray, timestep: Optional[int] = None
+ ) -> jnp.ndarray:
+ """
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+ current timestep.
+
+ Args:
+ state (`DPMSolverMultistepSchedulerState`):
+ the `FlaxDPMSolverMultistepScheduler` state data class instance.
+ sample (`jnp.ndarray`): input sample
+ timestep (`int`, optional): current timestep
+
+ Returns:
+ `jnp.ndarray`: scaled input sample
+ """
+ return sample
+
+ def add_noise(
+ self,
+ original_samples: jnp.ndarray,
+ noise: jnp.ndarray,
+ timesteps: jnp.ndarray,
+ ) -> jnp.ndarray:
+ sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+ sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape)
+
+        sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+ sqrt_one_minus_alpha_prod = broadcast_to_shape_from_left(sqrt_one_minus_alpha_prod, original_samples.shape)
+
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+ return noisy_samples
+
+ def __len__(self):
+ return self.config.num_train_timesteps
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/model/TextGen/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
new file mode 100644
index 0000000..621b5c1
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
@@ -0,0 +1,271 @@
+# Copyright 2022 Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput, logging
+from .scheduling_utils import SchedulerMixin
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->EulerAncestralDiscrete
+class EulerAncestralDiscreteSchedulerOutput(BaseOutput):
+ """
+ Output class for the scheduler's step function output.
+
+ Args:
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+ Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+ denoising loop.
+ pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+ The predicted denoised sample (x_{0}) based on the model output from the current timestep.
+ `pred_original_sample` can be used to preview progress or for guidance.
+ """
+
+ prev_sample: torch.FloatTensor
+ pred_original_sample: Optional[torch.FloatTensor] = None
+
+
+class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
+ """
+ Ancestral sampling with Euler method steps. Based on the original k-diffusion implementation by Katherine Crowson:
+ https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ beta_start (`float`): the starting `beta` value of inference.
+ beta_end (`float`): the final `beta` value.
+ beta_schedule (`str`):
+ the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+ `linear` or `scaled_linear`.
+ trained_betas (`np.ndarray`, optional):
+ option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+
+ """
+
+ _compatible_classes = [
+ "DDIMScheduler",
+ "DDPMScheduler",
+ "LMSDiscreteScheduler",
+ "PNDMScheduler",
+ "EulerDiscreteScheduler",
+ "DPMSolverMultistepScheduler",
+ ]
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[np.ndarray] = None,
+ ):
+ if trained_betas is not None:
+ self.betas = torch.from_numpy(trained_betas)
+ elif beta_schedule == "linear":
+ self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+ elif beta_schedule == "scaled_linear":
+ # this schedule is very specific to the latent diffusion model.
+ self.betas = (
+ torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+ )
+ else:
+ raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
+
+ self.alphas = 1.0 - self.betas
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+ sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
+ self.sigmas = torch.from_numpy(sigmas)
+
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = self.sigmas.max()
+
+ # setable values
+ self.num_inference_steps = None
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy()
+ self.timesteps = torch.from_numpy(timesteps)
+ self.is_scale_input_called = False
+
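+    # Note: sigma_t = sqrt((1 - alphas_cumprod_t) / alphas_cumprod_t) converts the
+    # VP-style (DDPM) schedule into the VE-style sigma parameterization used by
+    # k-diffusion; sigmas are stored high-to-low with a terminal 0.0 appended.
+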
+ def scale_model_input(
+ self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor]
+ ) -> torch.FloatTensor:
+ """
+ Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
+
+ Args:
+ sample (`torch.FloatTensor`): input sample
+ timestep (`float` or `torch.FloatTensor`): the current timestep in the diffusion chain
+
+ Returns:
+ `torch.FloatTensor`: scaled input sample
+ """
+ if isinstance(timestep, torch.Tensor):
+ timestep = timestep.to(self.timesteps.device)
+ step_index = (self.timesteps == timestep).nonzero().item()
+ sigma = self.sigmas[step_index]
+ sample = sample / ((sigma**2 + 1) ** 0.5)
+ self.is_scale_input_called = True
+ return sample
+
+ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+ """
+ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ device (`str` or `torch.device`, optional):
+ the device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+ """
+ self.num_inference_steps = num_inference_steps
+
+ timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+ sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+ sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
+ self.sigmas = torch.from_numpy(sigmas).to(device=device)
+ if str(device).startswith("mps"):
+ # mps does not support float64
+ self.timesteps = torch.from_numpy(timesteps).to(device, dtype=torch.float32)
+ else:
+ self.timesteps = torch.from_numpy(timesteps).to(device=device)
+
+ def step(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: Union[float, torch.FloatTensor],
+ sample: torch.FloatTensor,
+ generator: Optional[torch.Generator] = None,
+ return_dict: bool = True,
+ ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]:
+ """
+ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+ timestep (`float`): current timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+ generator (`torch.Generator`, optional): Random number generator.
+ return_dict (`bool`): option for returning tuple rather than EulerAncestralDiscreteSchedulerOutput class
+
+ Returns:
+ [`~schedulers.scheduling_utils.EulerAncestralDiscreteSchedulerOutput`] or `tuple`:
+ [`~schedulers.scheduling_utils.EulerAncestralDiscreteSchedulerOutput`] if `return_dict` is True, otherwise
+ a `tuple`. When returning a tuple, the first element is the sample tensor.
+
+ """
+
+        if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
+ raise ValueError(
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+ " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
+ " one of the `scheduler.timesteps` as a timestep.",
+ )
+
+ if not self.is_scale_input_called:
+            logger.warning(
+ "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
+ "See `StableDiffusionPipeline` for a usage example."
+ )
+
+ if isinstance(timestep, torch.Tensor):
+ timestep = timestep.to(self.timesteps.device)
+
+ step_index = (self.timesteps == timestep).nonzero().item()
+ sigma = self.sigmas[step_index]
+
+ # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+ pred_original_sample = sample - sigma * model_output
+ sigma_from = self.sigmas[step_index]
+ sigma_to = self.sigmas[step_index + 1]
+ sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
+ sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
+
+ # 2. Convert to an ODE derivative
+ derivative = (sample - pred_original_sample) / sigma
+
+ dt = sigma_down - sigma
+
+ prev_sample = sample + derivative * dt
+
+ device = model_output.device
+ if device.type == "mps":
+ # randn does not work reproducibly on mps
+ noise = torch.randn(model_output.shape, dtype=model_output.dtype, device="cpu", generator=generator).to(
+ device
+ )
+ else:
+ noise = torch.randn(model_output.shape, dtype=model_output.dtype, device=device, generator=generator).to(
+ device
+ )
+
+ prev_sample = prev_sample + noise * sigma_up
+
+ if not return_dict:
+ return (prev_sample,)
+
+ return EulerAncestralDiscreteSchedulerOutput(
+ prev_sample=prev_sample, pred_original_sample=pred_original_sample
+ )
+
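+    # The ancestral split above follows k-diffusion: sigma_to is decomposed into a
+    # deterministic part sigma_down and a stochastic part sigma_up such that
+    # sigma_down**2 + sigma_up**2 == sigma_to**2. A minimal usage sketch
+    # (hypothetical `unet` and `latents`, for illustration only):
+    #
+    #   scheduler.set_timesteps(30)
+    #   latents = latents * scheduler.init_noise_sigma
+    #   for t in scheduler.timesteps:
+    #       inp = scheduler.scale_model_input(latents, t)
+    #       latents = scheduler.step(unet(inp, t).sample, t, latents).prev_sample
+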
+ def add_noise(
+ self,
+ original_samples: torch.FloatTensor,
+ noise: torch.FloatTensor,
+ timesteps: torch.FloatTensor,
+ ) -> torch.FloatTensor:
+ # Make sure sigmas and timesteps have the same device and dtype as original_samples
+ self.sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+ if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+ # mps does not support float64
+ self.timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+ timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+ else:
+ self.timesteps = self.timesteps.to(original_samples.device)
+ timesteps = timesteps.to(original_samples.device)
+
+ schedule_timesteps = self.timesteps
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+ sigma = self.sigmas[step_indices].flatten()
+ while len(sigma.shape) < len(original_samples.shape):
+ sigma = sigma.unsqueeze(-1)
+
+ noisy_samples = original_samples + noise * sigma
+ return noisy_samples
+
+ def __len__(self):
+ return self.config.num_train_timesteps
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_euler_discrete.py b/src/model/TextGen/diffusers/schedulers/scheduling_euler_discrete.py
new file mode 100644
index 0000000..2f9e938
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_euler_discrete.py
@@ -0,0 +1,280 @@
+# Copyright 2022 Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput, logging
+from .scheduling_utils import SchedulerMixin
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->EulerDiscrete
+class EulerDiscreteSchedulerOutput(BaseOutput):
+ """
+ Output class for the scheduler's step function output.
+
+ Args:
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+ Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+ denoising loop.
+ pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+ The predicted denoised sample (x_{0}) based on the model output from the current timestep.
+ `pred_original_sample` can be used to preview progress or for guidance.
+ """
+
+ prev_sample: torch.FloatTensor
+ pred_original_sample: Optional[torch.FloatTensor] = None
+
+
+class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
+ """
+    Euler scheduler (Algorithm 2) from Karras et al. (2022) https://arxiv.org/abs/2206.00364. Based on the original
+ k-diffusion implementation by Katherine Crowson:
+ https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ beta_start (`float`): the starting `beta` value of inference.
+ beta_end (`float`): the final `beta` value.
+ beta_schedule (`str`):
+ the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+ `linear` or `scaled_linear`.
+ trained_betas (`np.ndarray`, optional):
+ option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+
+ """
+
+ _compatible_classes = [
+ "DDIMScheduler",
+ "DDPMScheduler",
+ "LMSDiscreteScheduler",
+ "PNDMScheduler",
+ "EulerAncestralDiscreteScheduler",
+ "DPMSolverMultistepScheduler",
+ ]
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[np.ndarray] = None,
+ ):
+ if trained_betas is not None:
+ self.betas = torch.from_numpy(trained_betas)
+ elif beta_schedule == "linear":
+ self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+ elif beta_schedule == "scaled_linear":
+ # this schedule is very specific to the latent diffusion model.
+ self.betas = (
+ torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+ )
+ else:
+ raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
+
+ self.alphas = 1.0 - self.betas
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+ sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
+ self.sigmas = torch.from_numpy(sigmas)
+
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = self.sigmas.max()
+
+ # setable values
+ self.num_inference_steps = None
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy()
+ self.timesteps = torch.from_numpy(timesteps)
+ self.is_scale_input_called = False
+
+ def scale_model_input(
+ self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor]
+ ) -> torch.FloatTensor:
+ """
+ Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
+
+ Args:
+ sample (`torch.FloatTensor`): input sample
+ timestep (`float` or `torch.FloatTensor`): the current timestep in the diffusion chain
+
+ Returns:
+ `torch.FloatTensor`: scaled input sample
+ """
+ if isinstance(timestep, torch.Tensor):
+ timestep = timestep.to(self.timesteps.device)
+ step_index = (self.timesteps == timestep).nonzero().item()
+ sigma = self.sigmas[step_index]
+ sample = sample / ((sigma**2 + 1) ** 0.5)
+ self.is_scale_input_called = True
+ return sample
+
+ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+ """
+ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ device (`str` or `torch.device`, optional):
+ the device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+ """
+ self.num_inference_steps = num_inference_steps
+
+ timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+ sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+ sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
+ self.sigmas = torch.from_numpy(sigmas).to(device=device)
+ if str(device).startswith("mps"):
+ # mps does not support float64
+ self.timesteps = torch.from_numpy(timesteps).to(device, dtype=torch.float32)
+ else:
+ self.timesteps = torch.from_numpy(timesteps).to(device=device)
+
+ def step(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: Union[float, torch.FloatTensor],
+ sample: torch.FloatTensor,
+ s_churn: float = 0.0,
+ s_tmin: float = 0.0,
+ s_tmax: float = float("inf"),
+ s_noise: float = 1.0,
+ generator: Optional[torch.Generator] = None,
+ return_dict: bool = True,
+ ) -> Union[EulerDiscreteSchedulerOutput, Tuple]:
+ """
+ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+ timestep (`float`): current timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+ s_churn (`float`)
+ s_tmin (`float`)
+ s_tmax (`float`)
+ s_noise (`float`)
+ generator (`torch.Generator`, optional): Random number generator.
+ return_dict (`bool`): option for returning tuple rather than EulerDiscreteSchedulerOutput class
+
+ Returns:
+ [`~schedulers.scheduling_utils.EulerDiscreteSchedulerOutput`] or `tuple`:
+ [`~schedulers.scheduling_utils.EulerDiscreteSchedulerOutput`] if `return_dict` is True, otherwise a
+ `tuple`. When returning a tuple, the first element is the sample tensor.
+
+ """
+
+        if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
+ raise ValueError(
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+ " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
+ " one of the `scheduler.timesteps` as a timestep.",
+ )
+
+ if not self.is_scale_input_called:
+            logger.warning(
+ "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
+ "See `StableDiffusionPipeline` for a usage example."
+ )
+
+ if isinstance(timestep, torch.Tensor):
+ timestep = timestep.to(self.timesteps.device)
+
+ step_index = (self.timesteps == timestep).nonzero().item()
+ sigma = self.sigmas[step_index]
+
+ gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0
+
+ device = model_output.device
+ if device.type == "mps":
+ # randn does not work reproducibly on mps
+ noise = torch.randn(model_output.shape, dtype=model_output.dtype, device="cpu", generator=generator).to(
+ device
+ )
+ else:
+ noise = torch.randn(model_output.shape, dtype=model_output.dtype, device=device, generator=generator).to(
+ device
+ )
+
+ eps = noise * s_noise
+ sigma_hat = sigma * (gamma + 1)
+
+ if gamma > 0:
+ sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5
+
+ # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+ pred_original_sample = sample - sigma_hat * model_output
+
+ # 2. Convert to an ODE derivative
+ derivative = (sample - pred_original_sample) / sigma_hat
+
+ dt = self.sigmas[step_index + 1] - sigma_hat
+
+ prev_sample = sample + derivative * dt
+
+ if not return_dict:
+ return (prev_sample,)
+
+ return EulerDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+
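+    # With the default s_churn=0.0 this reduces to a deterministic Euler step;
+    # a positive s_churn temporarily inflates sigma to sigma_hat = sigma * (gamma + 1)
+    # and injects matching noise, per Algorithm 2 of https://arxiv.org/abs/2206.00364.
+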
+ def add_noise(
+ self,
+ original_samples: torch.FloatTensor,
+ noise: torch.FloatTensor,
+ timesteps: torch.FloatTensor,
+ ) -> torch.FloatTensor:
+ # Make sure sigmas and timesteps have the same device and dtype as original_samples
+ self.sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+ if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+ # mps does not support float64
+ self.timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+ timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+ else:
+ self.timesteps = self.timesteps.to(original_samples.device)
+ timesteps = timesteps.to(original_samples.device)
+
+ schedule_timesteps = self.timesteps
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+ sigma = self.sigmas[step_indices].flatten()
+ while len(sigma.shape) < len(original_samples.shape):
+ sigma = sigma.unsqueeze(-1)
+
+ noisy_samples = original_samples + noise * sigma
+ return noisy_samples
+
+ def __len__(self):
+ return self.config.num_train_timesteps
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_ipndm.py b/src/model/TextGen/diffusers/schedulers/scheduling_ipndm.py
new file mode 100644
index 0000000..fb413a2
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_ipndm.py
@@ -0,0 +1,152 @@
+# Copyright 2022 Zhejiang University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Tuple, Union
+
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils import SchedulerMixin, SchedulerOutput
+
+
+class IPNDMScheduler(SchedulerMixin, ConfigMixin):
+ """
+ Improved Pseudo numerical methods for diffusion models (iPNDM) ported from @crowsonkb's amazing k-diffusion
+ [library](https://github.com/crowsonkb/v-diffusion-pytorch/blob/987f8985e38208345c1959b0ea767a625831cc9b/diffusion/sampling.py#L296)
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ For more details, see the original paper: https://arxiv.org/abs/2202.09778
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ """
+
+ @register_to_config
+ def __init__(self, num_train_timesteps: int = 1000):
+ # set `betas`, `alphas`, `timesteps`
+ self.set_timesteps(num_train_timesteps)
+
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = 1.0
+
+        # For now we only support F-PNDM, i.e. the fourth-order linear multistep method
+ # For more information on the algorithm please take a look at the paper: https://arxiv.org/pdf/2202.09778.pdf
+ # mainly at formula (9), (12), (13) and the Algorithm 2.
+ self.pndm_order = 4
+
+ # running values
+ self.ets = []
+
+ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+ """
+ Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ """
+ self.num_inference_steps = num_inference_steps
+ steps = torch.linspace(1, 0, num_inference_steps + 1)[:-1]
+ steps = torch.cat([steps, torch.tensor([0.0])])
+
+ self.betas = torch.sin(steps * math.pi / 2) ** 2
+ self.alphas = (1.0 - self.betas**2) ** 0.5
+
+ timesteps = (torch.atan2(self.betas, self.alphas) / math.pi * 2)[:-1]
+ self.timesteps = timesteps.to(device)
+
+ self.ets = []
+
+ def step(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: int,
+ sample: torch.FloatTensor,
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
+ """
+        Step function propagating the sample with the linear multistep method. Each step performs a single forward
+        pass and combines multiple cached model outputs to approximate the solution.
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+ return_dict (`bool`): option for returning tuple rather than SchedulerOutput class
+
+ Returns:
+ [`~scheduling_utils.SchedulerOutput`] or `tuple`: [`~scheduling_utils.SchedulerOutput`] if `return_dict` is
+ True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+
+ """
+ if self.num_inference_steps is None:
+ raise ValueError(
+ "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+ )
+
+ timestep_index = (self.timesteps == timestep).nonzero().item()
+ prev_timestep_index = timestep_index + 1
+
+ ets = sample * self.betas[timestep_index] + model_output * self.alphas[timestep_index]
+ self.ets.append(ets)
+
+ if len(self.ets) == 1:
+ ets = self.ets[-1]
+ elif len(self.ets) == 2:
+ ets = (3 * self.ets[-1] - self.ets[-2]) / 2
+ elif len(self.ets) == 3:
+ ets = (23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12
+ else:
+ ets = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 * self.ets[-3] - 9 * self.ets[-4])
+
+ prev_sample = self._get_prev_sample(sample, timestep_index, prev_timestep_index, ets)
+
+ if not return_dict:
+ return (prev_sample,)
+
+ return SchedulerOutput(prev_sample=prev_sample)
+
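+    # The branches above are the order-1 to order-4 Adams-Bashforth linear multistep
+    # coefficients (e.g. 55/24, -59/24, 37/24, -9/24 at order 4) applied to the
+    # cached `ets` values.
+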
+ def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
+ """
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+ current timestep.
+
+ Args:
+ sample (`torch.FloatTensor`): input sample
+
+ Returns:
+ `torch.FloatTensor`: scaled input sample
+ """
+ return sample
+
+ def _get_prev_sample(self, sample, timestep_index, prev_timestep_index, ets):
+ alpha = self.alphas[timestep_index]
+ sigma = self.betas[timestep_index]
+
+ next_alpha = self.alphas[prev_timestep_index]
+ next_sigma = self.betas[prev_timestep_index]
+
+ pred = (sample - sigma * ets) / max(alpha, 1e-8)
+ prev_sample = next_alpha * pred + ets * next_sigma
+
+ return prev_sample
+
+ def __len__(self):
+ return self.config.num_train_timesteps
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_karras_ve.py b/src/model/TextGen/diffusers/schedulers/scheduling_karras_ve.py
new file mode 100644
index 0000000..743f2e0
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_karras_ve.py
@@ -0,0 +1,230 @@
+# Copyright 2022 NVIDIA and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from .scheduling_utils import SchedulerMixin
+
+
+@dataclass
+class KarrasVeOutput(BaseOutput):
+ """
+ Output class for the scheduler's step function output.
+
+ Args:
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+ Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+ denoising loop.
+ derivative (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+ Derivative of predicted original image sample (x_0).
+ pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+ The predicted denoised sample (x_{0}) based on the model output from the current timestep.
+ `pred_original_sample` can be used to preview progress or for guidance.
+ """
+
+ prev_sample: torch.FloatTensor
+ derivative: torch.FloatTensor
+ pred_original_sample: Optional[torch.FloatTensor] = None
+
+
+class KarrasVeScheduler(SchedulerMixin, ConfigMixin):
+ """
+    Stochastic sampling from Karras et al. [1] tailored to Variance-Exploding (VE) models [2]. Use Algorithm 2 and
+ the VE column of Table 1 from [1] for reference.
+
+ [1] Karras, Tero, et al. "Elucidating the Design Space of Diffusion-Based Generative Models."
+    https://arxiv.org/abs/2206.00364
+    [2] Song, Yang, et al. "Score-based generative modeling through stochastic differential equations."
+    https://arxiv.org/abs/2011.13456
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ For more details on the parameters, see the original paper's Appendix E.: "Elucidating the Design Space of
+ Diffusion-Based Generative Models." https://arxiv.org/abs/2206.00364. The grid search values used to find the
+ optimal {s_noise, s_churn, s_min, s_max} for a specific model are described in Table 5 of the paper.
+
+ Args:
+ sigma_min (`float`): minimum noise magnitude
+ sigma_max (`float`): maximum noise magnitude
+ s_noise (`float`): the amount of additional noise to counteract loss of detail during sampling.
+ A reasonable range is [1.000, 1.011].
+ s_churn (`float`): the parameter controlling the overall amount of stochasticity.
+ A reasonable range is [0, 100].
+ s_min (`float`): the start value of the sigma range where we add noise (enable stochasticity).
+ A reasonable range is [0, 10].
+ s_max (`float`): the end value of the sigma range where we add noise.
+ A reasonable range is [0.2, 80].
+
+ """
+
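+    # A minimal sampling-loop sketch (illustration only, not part of this file;
+    # `model` is a hypothetical callable returning a denoising prediction, and any
+    # model-specific input/output scaling is omitted):
+    #
+    #   scheduler = KarrasVeScheduler()
+    #   scheduler.set_timesteps(50)
+    #   sample = torch.randn(1, 3, 64, 64) * scheduler.init_noise_sigma
+    #   for t in scheduler.timesteps:
+    #       sigma = scheduler.schedule[t]
+    #       sigma_prev = scheduler.schedule[t - 1] if t > 0 else 0.0
+    #       sample_hat, sigma_hat = scheduler.add_noise_to_input(sample, sigma)
+    #       model_output = model(sample_hat, sigma_hat)
+    #       sample = scheduler.step(model_output, sigma_hat, sigma_prev, sample_hat).prev_sample
+    #
+    # When sigma_prev != 0, a second model call plus `step_correct` can refine the step.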
+ @register_to_config
+ def __init__(
+ self,
+ sigma_min: float = 0.02,
+ sigma_max: float = 100,
+ s_noise: float = 1.007,
+ s_churn: float = 80,
+ s_min: float = 0.05,
+ s_max: float = 50,
+ ):
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = sigma_max
+
+ # setable values
+        self.num_inference_steps: Optional[int] = None
+        self.timesteps: Optional[torch.Tensor] = None
+        self.schedule: Optional[torch.FloatTensor] = None  # sigma(t_i)
+
+ def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
+ """
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+ current timestep.
+
+ Args:
+ sample (`torch.FloatTensor`): input sample
+ timestep (`int`, optional): current timestep
+
+ Returns:
+ `torch.FloatTensor`: scaled input sample
+ """
+ return sample
+
+ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+ """
+ Sets the continuous timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+
+ """
+ self.num_inference_steps = num_inference_steps
+ timesteps = np.arange(0, self.num_inference_steps)[::-1].copy()
+ self.timesteps = torch.from_numpy(timesteps).to(device)
+ schedule = [
+ (
+ self.config.sigma_max**2
+ * (self.config.sigma_min**2 / self.config.sigma_max**2) ** (i / (num_inference_steps - 1))
+ )
+ for i in self.timesteps
+ ]
+ self.schedule = torch.tensor(schedule, dtype=torch.float32, device=device)
+
+ def add_noise_to_input(
+ self, sample: torch.FloatTensor, sigma: float, generator: Optional[torch.Generator] = None
+ ) -> Tuple[torch.FloatTensor, float]:
+ """
+ Explicit Langevin-like "churn" step of adding noise to the sample according to a factor gamma_i ≥ 0 to reach a
+ higher noise level sigma_hat = sigma_i + gamma_i*sigma_i.
+
+        Args:
+            sample (`torch.FloatTensor`): the current sample in the diffusion process.
+            sigma (`float`): the current noise level sigma_i from the schedule.
+            generator (`torch.Generator`, optional): random number generator for the churn noise.
+        """
+ if self.config.s_min <= sigma <= self.config.s_max:
+ gamma = min(self.config.s_churn / self.num_inference_steps, 2**0.5 - 1)
+ else:
+ gamma = 0
+
+ # sample eps ~ N(0, S_noise^2 * I)
+ eps = self.config.s_noise * torch.randn(sample.shape, generator=generator).to(sample.device)
+ sigma_hat = sigma + gamma * sigma
+ sample_hat = sample + ((sigma_hat**2 - sigma**2) ** 0.5 * eps)
+
+ return sample_hat, sigma_hat
+
+ def step(
+ self,
+ model_output: torch.FloatTensor,
+ sigma_hat: float,
+ sigma_prev: float,
+ sample_hat: torch.FloatTensor,
+ return_dict: bool = True,
+ ) -> Union[KarrasVeOutput, Tuple]:
+ """
+ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+            sigma_hat (`float`): the increased noise level, sigma_hat = sigma_i + gamma_i * sigma_i.
+            sigma_prev (`float`): the noise level of the next step in the schedule.
+            sample_hat (`torch.FloatTensor`): the sample after the "churn" noise has been added.
+            return_dict (`bool`): option for returning tuple rather than KarrasVeOutput class
+
+ Returns:
+ [`~schedulers.scheduling_karras_ve.KarrasVeOutput`] or `tuple`:
+ [`~schedulers.scheduling_karras_ve.KarrasVeOutput`] if `return_dict` is True, otherwise a `tuple`. When
+ returning a tuple, the first element is the sample tensor.
+
+ """
+
+ pred_original_sample = sample_hat + sigma_hat * model_output
+ derivative = (sample_hat - pred_original_sample) / sigma_hat
+ sample_prev = sample_hat + (sigma_prev - sigma_hat) * derivative
+
+ if not return_dict:
+ return (sample_prev, derivative)
+
+ return KarrasVeOutput(
+ prev_sample=sample_prev, derivative=derivative, pred_original_sample=pred_original_sample
+ )
+
+ def step_correct(
+ self,
+ model_output: torch.FloatTensor,
+ sigma_hat: float,
+ sigma_prev: float,
+ sample_hat: torch.FloatTensor,
+ sample_prev: torch.FloatTensor,
+ derivative: torch.FloatTensor,
+ return_dict: bool = True,
+ ) -> Union[KarrasVeOutput, Tuple]:
+ """
+        Correct the predicted sample based on the `model_output` of the network, applying a second-order
+        (Heun-like) correction to the preceding Euler step.
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+            sigma_hat (`float`): the increased noise level of the current step.
+            sigma_prev (`float`): the noise level of the next step in the schedule.
+            sample_hat (`torch.FloatTensor`): the sample after the "churn" noise has been added.
+            sample_prev (`torch.FloatTensor`): the sample predicted by the preceding Euler step.
+            derivative (`torch.FloatTensor`): the derivative from the preceding Euler step.
+ return_dict (`bool`): option for returning tuple rather than KarrasVeOutput class
+
+ Returns:
+            [`~schedulers.scheduling_karras_ve.KarrasVeOutput`] or `tuple`: the corrected sample in the diffusion
+            chain and the derivative from the preceding Euler step. When returning a tuple, the first element is
+            the sample tensor.
+
+ """
+ pred_original_sample = sample_prev + sigma_prev * model_output
+ derivative_corr = (sample_prev - pred_original_sample) / sigma_prev
+ sample_prev = sample_hat + (sigma_prev - sigma_hat) * (0.5 * derivative + 0.5 * derivative_corr)
+
+ if not return_dict:
+ return (sample_prev, derivative)
+
+ return KarrasVeOutput(
+ prev_sample=sample_prev, derivative=derivative, pred_original_sample=pred_original_sample
+ )
+
+ def add_noise(self, original_samples, noise, timesteps):
+ raise NotImplementedError()
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_karras_ve_flax.py b/src/model/TextGen/diffusers/schedulers/scheduling_karras_ve_flax.py
new file mode 100644
index 0000000..78ab007
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_karras_ve_flax.py
@@ -0,0 +1,237 @@
+# Copyright 2022 NVIDIA and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import flax
+import jax.numpy as jnp
+from jax import random
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from .scheduling_utils_flax import FlaxSchedulerMixin
+
+
+@flax.struct.dataclass
+class KarrasVeSchedulerState:
+ # setable values
+ num_inference_steps: Optional[int] = None
+ timesteps: Optional[jnp.ndarray] = None
+ schedule: Optional[jnp.ndarray] = None # sigma(t_i)
+
+ @classmethod
+ def create(cls):
+ return cls()
+
+
+@dataclass
+class FlaxKarrasVeOutput(BaseOutput):
+ """
+ Output class for the scheduler's step function output.
+
+ Args:
+ prev_sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)` for images):
+ Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+ denoising loop.
+ derivative (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)` for images):
+ Derivative of predicted original image sample (x_0).
+ state (`KarrasVeSchedulerState`): the `FlaxKarrasVeScheduler` state data class.
+ """
+
+ prev_sample: jnp.ndarray
+ derivative: jnp.ndarray
+ state: KarrasVeSchedulerState
+
+
+class FlaxKarrasVeScheduler(FlaxSchedulerMixin, ConfigMixin):
+ """
+    Stochastic sampling from Karras et al. [1] tailored to Variance-Exploding (VE) models [2]. Use Algorithm 2 and
+ the VE column of Table 1 from [1] for reference.
+
+ [1] Karras, Tero, et al. "Elucidating the Design Space of Diffusion-Based Generative Models."
+    https://arxiv.org/abs/2206.00364
+    [2] Song, Yang, et al. "Score-based generative modeling through stochastic differential equations."
+    https://arxiv.org/abs/2011.13456
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ For more details on the parameters, see the original paper's Appendix E.: "Elucidating the Design Space of
+ Diffusion-Based Generative Models." https://arxiv.org/abs/2206.00364. The grid search values used to find the
+ optimal {s_noise, s_churn, s_min, s_max} for a specific model are described in Table 5 of the paper.
+
+ Args:
+ sigma_min (`float`): minimum noise magnitude
+ sigma_max (`float`): maximum noise magnitude
+ s_noise (`float`): the amount of additional noise to counteract loss of detail during sampling.
+ A reasonable range is [1.000, 1.011].
+ s_churn (`float`): the parameter controlling the overall amount of stochasticity.
+ A reasonable range is [0, 100].
+ s_min (`float`): the start value of the sigma range where we add noise (enable stochasticity).
+ A reasonable range is [0, 10].
+ s_max (`float`): the end value of the sigma range where we add noise.
+ A reasonable range is [0.2, 80].
+ """
+
+ @property
+ def has_state(self):
+ return True
+
+ @register_to_config
+ def __init__(
+ self,
+ sigma_min: float = 0.02,
+ sigma_max: float = 100,
+ s_noise: float = 1.007,
+ s_churn: float = 80,
+ s_min: float = 0.05,
+ s_max: float = 50,
+ ):
+ pass
+
+ def create_state(self):
+ return KarrasVeSchedulerState.create()
+
+ def set_timesteps(
+ self, state: KarrasVeSchedulerState, num_inference_steps: int, shape: Tuple = ()
+ ) -> KarrasVeSchedulerState:
+ """
+ Sets the continuous timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ state (`KarrasVeSchedulerState`):
+ the `FlaxKarrasVeScheduler` state data class.
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+
+ """
+ timesteps = jnp.arange(0, num_inference_steps)[::-1].copy()
+ schedule = [
+ (
+ self.config.sigma_max**2
+ * (self.config.sigma_min**2 / self.config.sigma_max**2) ** (i / (num_inference_steps - 1))
+ )
+ for i in timesteps
+ ]
+
+ return state.replace(
+ num_inference_steps=num_inference_steps,
+ schedule=jnp.array(schedule, dtype=jnp.float32),
+ timesteps=timesteps,
+ )
+
+ def add_noise_to_input(
+ self,
+ state: KarrasVeSchedulerState,
+ sample: jnp.ndarray,
+ sigma: float,
+ key: random.KeyArray,
+ ) -> Tuple[jnp.ndarray, float]:
+ """
+ Explicit Langevin-like "churn" step of adding noise to the sample according to a factor gamma_i ≥ 0 to reach a
+ higher noise level sigma_hat = sigma_i + gamma_i*sigma_i.
+
+        Args:
+            state (`KarrasVeSchedulerState`): the `FlaxKarrasVeScheduler` state data class.
+            sample (`jnp.ndarray`): the current sample in the diffusion process.
+            sigma (`float`): the current noise level sigma_i from the schedule.
+            key (`random.KeyArray`): a PRNG key for the churn noise.
+        """
+ if self.config.s_min <= sigma <= self.config.s_max:
+ gamma = min(self.config.s_churn / state.num_inference_steps, 2**0.5 - 1)
+ else:
+ gamma = 0
+
+ # sample eps ~ N(0, S_noise^2 * I)
+        key = random.split(key, num=1)[0]
+ eps = self.config.s_noise * random.normal(key=key, shape=sample.shape)
+ sigma_hat = sigma + gamma * sigma
+ sample_hat = sample + ((sigma_hat**2 - sigma**2) ** 0.5 * eps)
+
+ return sample_hat, sigma_hat
+
+ def step(
+ self,
+ state: KarrasVeSchedulerState,
+ model_output: jnp.ndarray,
+ sigma_hat: float,
+ sigma_prev: float,
+ sample_hat: jnp.ndarray,
+ return_dict: bool = True,
+ ) -> Union[FlaxKarrasVeOutput, Tuple]:
+ """
+ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ Args:
+ state (`KarrasVeSchedulerState`): the `FlaxKarrasVeScheduler` state data class.
+            model_output (`jnp.ndarray`): direct output from learned diffusion model.
+            sigma_hat (`float`): the increased noise level, sigma_hat = sigma_i + gamma_i * sigma_i.
+            sigma_prev (`float`): the noise level of the next step in the schedule.
+            sample_hat (`jnp.ndarray`): the sample after the "churn" noise has been added.
+ return_dict (`bool`): option for returning tuple rather than FlaxKarrasVeOutput class
+
+ Returns:
+ [`~schedulers.scheduling_karras_ve_flax.FlaxKarrasVeOutput`] or `tuple`: Updated sample in the diffusion
+ chain and derivative. [`~schedulers.scheduling_karras_ve_flax.FlaxKarrasVeOutput`] if `return_dict` is
+ True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+ """
+
+ pred_original_sample = sample_hat + sigma_hat * model_output
+ derivative = (sample_hat - pred_original_sample) / sigma_hat
+ sample_prev = sample_hat + (sigma_prev - sigma_hat) * derivative
+
+ if not return_dict:
+ return (sample_prev, derivative, state)
+
+ return FlaxKarrasVeOutput(prev_sample=sample_prev, derivative=derivative, state=state)
+
+ def step_correct(
+ self,
+ state: KarrasVeSchedulerState,
+ model_output: jnp.ndarray,
+ sigma_hat: float,
+ sigma_prev: float,
+ sample_hat: jnp.ndarray,
+ sample_prev: jnp.ndarray,
+ derivative: jnp.ndarray,
+ return_dict: bool = True,
+ ) -> Union[FlaxKarrasVeOutput, Tuple]:
+ """
+        Correct the predicted sample based on the `model_output` of the network, applying a second-order
+        (Heun-like) correction to the preceding Euler step.
+
+ Args:
+ state (`KarrasVeSchedulerState`): the `FlaxKarrasVeScheduler` state data class.
+            model_output (`jnp.ndarray`): direct output from learned diffusion model.
+            sigma_hat (`float`): the increased noise level of the current step.
+            sigma_prev (`float`): the noise level of the next step in the schedule.
+            sample_hat (`jnp.ndarray`): the sample after the "churn" noise has been added.
+            sample_prev (`jnp.ndarray`): the sample predicted by the preceding Euler step.
+            derivative (`jnp.ndarray`): the derivative from the preceding Euler step.
+ return_dict (`bool`): option for returning tuple rather than FlaxKarrasVeOutput class
+
+ Returns:
+            [`~schedulers.scheduling_karras_ve_flax.FlaxKarrasVeOutput`] or `tuple`: the corrected sample in the
+            diffusion chain, the derivative from the preceding Euler step and the scheduler state. When returning
+            a tuple, the first element is the sample tensor.
+
+ """
+ pred_original_sample = sample_prev + sigma_prev * model_output
+ derivative_corr = (sample_prev - pred_original_sample) / sigma_prev
+ sample_prev = sample_hat + (sigma_prev - sigma_hat) * (0.5 * derivative + 0.5 * derivative_corr)
+
+ if not return_dict:
+ return (sample_prev, derivative, state)
+
+ return FlaxKarrasVeOutput(prev_sample=sample_prev, derivative=derivative, state=state)
+
+ def add_noise(self, original_samples, noise, timesteps):
+ raise NotImplementedError()
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_lms_discrete.py b/src/model/TextGen/diffusers/schedulers/scheduling_lms_discrete.py
new file mode 100644
index 0000000..373c373
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_lms_discrete.py
@@ -0,0 +1,273 @@
+# Copyright 2022 Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from scipy import integrate
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from .scheduling_utils import SchedulerMixin
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->LMSDiscrete
+class LMSDiscreteSchedulerOutput(BaseOutput):
+ """
+ Output class for the scheduler's step function output.
+
+ Args:
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+ Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+ denoising loop.
+ pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+ The predicted denoised sample (x_{0}) based on the model output from the current timestep.
+ `pred_original_sample` can be used to preview progress or for guidance.
+ """
+
+ prev_sample: torch.FloatTensor
+ pred_original_sample: Optional[torch.FloatTensor] = None
+
+
+class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
+ """
+ Linear Multistep Scheduler for discrete beta schedules. Based on the original k-diffusion implementation by
+ Katherine Crowson:
+ https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ beta_start (`float`): the starting `beta` value of inference.
+ beta_end (`float`): the final `beta` value.
+ beta_schedule (`str`):
+ the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+ `linear` or `scaled_linear`.
+ trained_betas (`np.ndarray`, optional):
+ option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+
+ """
+
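+    # A minimal denoising-loop sketch (illustration only, not part of this file;
+    # `model` is a hypothetical callable that predicts noise from a scaled sample
+    # and a timestep):
+    #
+    #   scheduler = LMSDiscreteScheduler()
+    #   scheduler.set_timesteps(50)
+    #   sample = torch.randn(1, 4, 64, 64) * scheduler.init_noise_sigma
+    #   for t in scheduler.timesteps:
+    #       model_input = scheduler.scale_model_input(sample, t)
+    #       noise_pred = model(model_input, t)
+    #       sample = scheduler.step(noise_pred, t, sample).prev_sample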
+ _compatible_classes = [
+ "DDIMScheduler",
+ "DDPMScheduler",
+ "PNDMScheduler",
+ "EulerDiscreteScheduler",
+ "EulerAncestralDiscreteScheduler",
+ "DPMSolverMultistepScheduler",
+ ]
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[np.ndarray] = None,
+ ):
+ if trained_betas is not None:
+ self.betas = torch.from_numpy(trained_betas)
+ elif beta_schedule == "linear":
+ self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+ elif beta_schedule == "scaled_linear":
+ # this schedule is very specific to the latent diffusion model.
+ self.betas = (
+ torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+ )
+ else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+ self.alphas = 1.0 - self.betas
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+ sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
+ self.sigmas = torch.from_numpy(sigmas)
+
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = self.sigmas.max()
+
+ # setable values
+ self.num_inference_steps = None
+ timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy()
+ self.timesteps = torch.from_numpy(timesteps)
+ self.derivatives = []
+ self.is_scale_input_called = False
+
+ def scale_model_input(
+ self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor]
+ ) -> torch.FloatTensor:
+ """
+ Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the K-LMS algorithm.
+
+ Args:
+ sample (`torch.FloatTensor`): input sample
+ timestep (`float` or `torch.FloatTensor`): the current timestep in the diffusion chain
+
+ Returns:
+ `torch.FloatTensor`: scaled input sample
+ """
+ if isinstance(timestep, torch.Tensor):
+ timestep = timestep.to(self.timesteps.device)
+ step_index = (self.timesteps == timestep).nonzero().item()
+ sigma = self.sigmas[step_index]
+ sample = sample / ((sigma**2 + 1) ** 0.5)
+ self.is_scale_input_called = True
+ return sample
+
+ def get_lms_coefficient(self, order, t, current_order):
+ """
+ Compute a linear multistep coefficient.
+
+ Args:
+            order (`int`): the order of the linear multistep method.
+            t (`int`): the index of the current timestep in the sigma schedule.
+            current_order (`int`): the index of the cached derivative the coefficient multiplies.
+ """
+
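+        # The coefficient of the k-th cached derivative is the integral, over the
+        # current step [sigma_t, sigma_{t+1}], of the Lagrange basis polynomial that
+        # is 1 at sigma_{t - current_order} and 0 at the other cached sigmas.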
+ def lms_derivative(tau):
+ prod = 1.0
+ for k in range(order):
+ if current_order == k:
+ continue
+ prod *= (tau - self.sigmas[t - k]) / (self.sigmas[t - current_order] - self.sigmas[t - k])
+ return prod
+
+ integrated_coeff = integrate.quad(lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0]
+
+ return integrated_coeff
+
+ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+ """
+ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ device (`str` or `torch.device`, optional):
+ the device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+ """
+ self.num_inference_steps = num_inference_steps
+
+ timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
+ sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+ sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+ sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
+
+ self.sigmas = torch.from_numpy(sigmas).to(device=device)
+ if str(device).startswith("mps"):
+ # mps does not support float64
+ self.timesteps = torch.from_numpy(timesteps).to(device, dtype=torch.float32)
+ else:
+ self.timesteps = torch.from_numpy(timesteps).to(device=device)
+
+ self.derivatives = []
+
+ def step(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: Union[float, torch.FloatTensor],
+ sample: torch.FloatTensor,
+ order: int = 4,
+ return_dict: bool = True,
+ ) -> Union[LMSDiscreteSchedulerOutput, Tuple]:
+ """
+ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+ timestep (`float`): current timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+            order (`int`): the order of the linear multistep method.
+ return_dict (`bool`): option for returning tuple rather than LMSDiscreteSchedulerOutput class
+
+ Returns:
+            [`~schedulers.scheduling_lms_discrete.LMSDiscreteSchedulerOutput`] or `tuple`:
+            [`~schedulers.scheduling_lms_discrete.LMSDiscreteSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`.
+ When returning a tuple, the first element is the sample tensor.
+
+ """
+ if not self.is_scale_input_called:
+ warnings.warn(
+ "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
+ "See `StableDiffusionPipeline` for a usage example."
+ )
+
+ if isinstance(timestep, torch.Tensor):
+ timestep = timestep.to(self.timesteps.device)
+ step_index = (self.timesteps == timestep).nonzero().item()
+ sigma = self.sigmas[step_index]
+
+ # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+ pred_original_sample = sample - sigma * model_output
+
+ # 2. Convert to an ODE derivative
+ derivative = (sample - pred_original_sample) / sigma
+ self.derivatives.append(derivative)
+ if len(self.derivatives) > order:
+ self.derivatives.pop(0)
+
+ # 3. Compute linear multistep coefficients
+ order = min(step_index + 1, order)
+ lms_coeffs = [self.get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)]
+
+ # 4. Compute previous sample based on the derivatives path
+ prev_sample = sample + sum(
+ coeff * derivative for coeff, derivative in zip(lms_coeffs, reversed(self.derivatives))
+ )
+
+ if not return_dict:
+ return (prev_sample,)
+
+ return LMSDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+
+ def add_noise(
+ self,
+ original_samples: torch.FloatTensor,
+ noise: torch.FloatTensor,
+ timesteps: torch.FloatTensor,
+ ) -> torch.FloatTensor:
+ # Make sure sigmas and timesteps have the same device and dtype as original_samples
+ self.sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+ if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+ # mps does not support float64
+ self.timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+ timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+ else:
+ self.timesteps = self.timesteps.to(original_samples.device)
+ timesteps = timesteps.to(original_samples.device)
+
+ schedule_timesteps = self.timesteps
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+ sigma = self.sigmas[step_indices].flatten()
+ while len(sigma.shape) < len(original_samples.shape):
+ sigma = sigma.unsqueeze(-1)
+
+ noisy_samples = original_samples + noise * sigma
+ return noisy_samples
+
+ def __len__(self):
+ return self.config.num_train_timesteps
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_lms_discrete_flax.py b/src/model/TextGen/diffusers/schedulers/scheduling_lms_discrete_flax.py
new file mode 100644
index 0000000..20982d3
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_lms_discrete_flax.py
@@ -0,0 +1,215 @@
+# Copyright 2022 Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import flax
+import jax.numpy as jnp
+from scipy import integrate
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
+
+
+@flax.struct.dataclass
+class LMSDiscreteSchedulerState:
+ # setable values
+ num_inference_steps: Optional[int] = None
+ timesteps: Optional[jnp.ndarray] = None
+ sigmas: Optional[jnp.ndarray] = None
+ derivatives: jnp.ndarray = jnp.array([])
+
+ @classmethod
+ def create(cls, num_train_timesteps: int, sigmas: jnp.ndarray):
+ return cls(timesteps=jnp.arange(0, num_train_timesteps)[::-1], sigmas=sigmas)
+
+
+@dataclass
+class FlaxLMSSchedulerOutput(FlaxSchedulerOutput):
+ state: LMSDiscreteSchedulerState
+
+
+class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin):
+ """
+ Linear Multistep Scheduler for discrete beta schedules. Based on the original k-diffusion implementation by
+ Katherine Crowson:
+ https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ beta_start (`float`): the starting `beta` value of inference.
+ beta_end (`float`): the final `beta` value.
+ beta_schedule (`str`):
+ the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+ `linear` or `scaled_linear`.
+ trained_betas (`jnp.ndarray`, optional):
+ option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+ """
+
+ @property
+ def has_state(self):
+ return True
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[jnp.ndarray] = None,
+ ):
+ if trained_betas is not None:
+ self.betas = jnp.asarray(trained_betas)
+ elif beta_schedule == "linear":
+ self.betas = jnp.linspace(beta_start, beta_end, num_train_timesteps, dtype=jnp.float32)
+ elif beta_schedule == "scaled_linear":
+ # this schedule is very specific to the latent diffusion model.
+ self.betas = jnp.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=jnp.float32) ** 2
+ else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+ self.alphas = 1.0 - self.betas
+ self.alphas_cumprod = jnp.cumprod(self.alphas, axis=0)
+
+ def create_state(self):
+ self.state = LMSDiscreteSchedulerState.create(
+ num_train_timesteps=self.config.num_train_timesteps,
+ sigmas=((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5,
+ )
+
+ def get_lms_coefficient(self, state, order, t, current_order):
+ """
+ Compute a linear multistep coefficient.
+
+ Args:
+            state (`LMSDiscreteSchedulerState`): the scheduler state holding the sigma schedule.
+            order (`int`): the order of the linear multistep method.
+            t (`int`): the index of the current timestep in the sigma schedule.
+            current_order (`int`): the index of the cached derivative the coefficient multiplies.
+ """
+
+ def lms_derivative(tau):
+ prod = 1.0
+ for k in range(order):
+ if current_order == k:
+ continue
+ prod *= (tau - state.sigmas[t - k]) / (state.sigmas[t - current_order] - state.sigmas[t - k])
+ return prod
+
+ integrated_coeff = integrate.quad(lms_derivative, state.sigmas[t], state.sigmas[t + 1], epsrel=1e-4)[0]
+
+ return integrated_coeff
+
+ def set_timesteps(
+ self, state: LMSDiscreteSchedulerState, num_inference_steps: int, shape: Tuple = ()
+ ) -> LMSDiscreteSchedulerState:
+ """
+ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ state (`LMSDiscreteSchedulerState`):
+ the `FlaxLMSDiscreteScheduler` state data class instance.
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ """
+ timesteps = jnp.linspace(self.config.num_train_timesteps - 1, 0, num_inference_steps, dtype=jnp.float32)
+
+ low_idx = jnp.floor(timesteps).astype(int)
+ high_idx = jnp.ceil(timesteps).astype(int)
+ frac = jnp.mod(timesteps, 1.0)
+ sigmas = jnp.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+ sigmas = (1 - frac) * sigmas[low_idx] + frac * sigmas[high_idx]
+ sigmas = jnp.concatenate([sigmas, jnp.array([0.0])]).astype(jnp.float32)
+
+ return state.replace(
+ num_inference_steps=num_inference_steps,
+ timesteps=timesteps.astype(int),
+ derivatives=jnp.array([]),
+ sigmas=sigmas,
+ )
+
+ def step(
+ self,
+ state: LMSDiscreteSchedulerState,
+ model_output: jnp.ndarray,
+ timestep: int,
+ sample: jnp.ndarray,
+ order: int = 4,
+ return_dict: bool = True,
+ ) -> Union[FlaxLMSSchedulerOutput, Tuple]:
+ """
+ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ Args:
+ state (`LMSDiscreteSchedulerState`): the `FlaxLMSDiscreteScheduler` state data class instance.
+ model_output (`jnp.ndarray`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`jnp.ndarray`):
+ current instance of sample being created by diffusion process.
+            order (`int`): the order of the linear multistep method.
+ return_dict (`bool`): option for returning tuple rather than FlaxLMSSchedulerOutput class
+
+ Returns:
+ [`FlaxLMSSchedulerOutput`] or `tuple`: [`FlaxLMSSchedulerOutput`] if `return_dict` is True, otherwise a
+ `tuple`. When returning a tuple, the first element is the sample tensor.
+
+ """
+ sigma = state.sigmas[timestep]
+
+ # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+ pred_original_sample = sample - sigma * model_output
+
+ # 2. Convert to an ODE derivative
+ derivative = (sample - pred_original_sample) / sigma
+        # jnp arrays are immutable, so the rolling derivative buffer is rebuilt functionally
+        state = state.replace(derivatives=jnp.append(state.derivatives, derivative))
+        if len(state.derivatives) > order:
+            state = state.replace(derivatives=jnp.delete(state.derivatives, 0))
+
+ # 3. Compute linear multistep coefficients
+ order = min(timestep + 1, order)
+ lms_coeffs = [self.get_lms_coefficient(state, order, timestep, curr_order) for curr_order in range(order)]
+
+ # 4. Compute previous sample based on the derivatives path
+ prev_sample = sample + sum(
+ coeff * derivative for coeff, derivative in zip(lms_coeffs, reversed(state.derivatives))
+ )
+
+ if not return_dict:
+ return (prev_sample, state)
+
+ return FlaxLMSSchedulerOutput(prev_sample=prev_sample, state=state)
+
+ def add_noise(
+ self,
+ state: LMSDiscreteSchedulerState,
+ original_samples: jnp.ndarray,
+ noise: jnp.ndarray,
+ timesteps: jnp.ndarray,
+ ) -> jnp.ndarray:
+ sigma = state.sigmas[timesteps].flatten()
+ sigma = broadcast_to_shape_from_left(sigma, noise.shape)
+
+ noisy_samples = original_samples + noise * sigma
+
+ return noisy_samples
+
+ def __len__(self):
+ return self.config.num_train_timesteps
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_pndm.py b/src/model/TextGen/diffusers/schedulers/scheduling_pndm.py
new file mode 100644
index 0000000..3e7c3a4
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_pndm.py
@@ -0,0 +1,420 @@
+# Copyright 2022 Zhejiang University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
+
+import math
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils import SchedulerMixin, SchedulerOutput
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
+ """
+ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+ (1-beta) over time from t = [0,1].
+
+ Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+ to that part of the diffusion process.
+
+
+ Args:
+ num_diffusion_timesteps (`int`): the number of betas to produce.
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to
+ prevent singularities.
+
+ Returns:
+ betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+ """
+
+ def alpha_bar(time_step):
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+ betas = []
+ for i in range(num_diffusion_timesteps):
+ t1 = i / num_diffusion_timesteps
+ t2 = (i + 1) / num_diffusion_timesteps
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+ return torch.tensor(betas, dtype=torch.float32)
+
+
+class PNDMScheduler(SchedulerMixin, ConfigMixin):
+ """
+ Pseudo numerical methods for diffusion models (PNDM) proposes using more advanced ODE integration techniques,
+    namely the Runge-Kutta method and a linear multi-step method.
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ For more details, see the original paper: https://arxiv.org/abs/2202.09778
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ beta_start (`float`): the starting `beta` value of inference.
+ beta_end (`float`): the final `beta` value.
+ beta_schedule (`str`):
+ the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+ `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+ trained_betas (`np.ndarray`, optional):
+ option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+ skip_prk_steps (`bool`):
+ allows the scheduler to skip the Runge-Kutta steps that are defined in the original paper as being required
+ before plms steps; defaults to `False`.
+ set_alpha_to_one (`bool`, default `False`):
+ each diffusion step uses the value of alphas product at that step and at the previous one. For the final
+ step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
+ otherwise it uses the value of alpha at step 0.
+ steps_offset (`int`, default `0`):
+ an offset added to the inference steps. You can use a combination of `offset=1` and
+ `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in
+ stable diffusion.
+
+ """
+
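+    # A minimal denoising-loop sketch (illustration only, not part of this file;
+    # `model` is a hypothetical callable that predicts noise from a sample and a
+    # timestep). `step` dispatches to `step_prk` or `step_plms` internally, so the
+    # loop stays the same either way:
+    #
+    #   scheduler = PNDMScheduler()
+    #   scheduler.set_timesteps(50)
+    #   sample = torch.randn(1, 4, 64, 64) * scheduler.init_noise_sigma
+    #   for t in scheduler.timesteps:
+    #       noise_pred = model(sample, t)
+    #       sample = scheduler.step(noise_pred, t, sample).prev_sample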
+ _compatible_classes = [
+ "DDIMScheduler",
+ "DDPMScheduler",
+ "LMSDiscreteScheduler",
+ "EulerDiscreteScheduler",
+ "EulerAncestralDiscreteScheduler",
+ "DPMSolverMultistepScheduler",
+ ]
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[np.ndarray] = None,
+ skip_prk_steps: bool = False,
+ set_alpha_to_one: bool = False,
+ steps_offset: int = 0,
+ ):
+ if trained_betas is not None:
+ self.betas = torch.from_numpy(trained_betas)
+ elif beta_schedule == "linear":
+ self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+ elif beta_schedule == "scaled_linear":
+ # this schedule is very specific to the latent diffusion model.
+ self.betas = (
+ torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+ )
+ elif beta_schedule == "squaredcos_cap_v2":
+ # Glide cosine schedule
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
+ else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+ self.alphas = 1.0 - self.betas
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+ self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
+
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = 1.0
+
+ # For now we only support F-PNDM, i.e. the runge-kutta method
+ # For more information on the algorithm please take a look at the paper: https://arxiv.org/pdf/2202.09778.pdf
+ # mainly at formula (9), (12), (13) and the Algorithm 2.
+ self.pndm_order = 4
+
+ # running values
+ self.cur_model_output = 0
+ self.counter = 0
+ self.cur_sample = None
+ self.ets = []
+
+ # setable values
+ self.num_inference_steps = None
+ self._timesteps = np.arange(0, num_train_timesteps)[::-1].copy()
+ self.prk_timesteps = None
+ self.plms_timesteps = None
+ self.timesteps = None
+
+ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None, **kwargs):
+ """
+ Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ """
+
+ self.num_inference_steps = num_inference_steps
+ step_ratio = self.config.num_train_timesteps // self.num_inference_steps
+ # creates integer timesteps by multiplying by ratio
+ # casting to int to avoid issues when num_inference_step is power of 3
+ self._timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()
+ self._timesteps += self.config.steps_offset
+
+ if self.config.skip_prk_steps:
+ # for some models like stable diffusion the prk steps can/should be skipped to
+ # produce better results. When using PNDM with `self.config.skip_prk_steps` the implementation
+ # is based on crowsonkb's PLMS sampler implementation: https://github.com/CompVis/latent-diffusion/pull/51
+ self.prk_timesteps = np.array([])
+ self.plms_timesteps = np.concatenate([self._timesteps[:-1], self._timesteps[-2:-1], self._timesteps[-1:]])[
+ ::-1
+ ].copy()
+ else:
+ prk_timesteps = np.array(self._timesteps[-self.pndm_order :]).repeat(2) + np.tile(
+ np.array([0, self.config.num_train_timesteps // num_inference_steps // 2]), self.pndm_order
+ )
+ self.prk_timesteps = (prk_timesteps[:-1].repeat(2)[1:-1])[::-1].copy()
+ self.plms_timesteps = self._timesteps[:-3][
+ ::-1
+ ].copy() # we copy to avoid having negative strides which are not supported by torch.from_numpy
+
+ timesteps = np.concatenate([self.prk_timesteps, self.plms_timesteps]).astype(np.int64)
+ self.timesteps = torch.from_numpy(timesteps).to(device)
+
+ self.ets = []
+ self.counter = 0
+
+ def step(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: int,
+ sample: torch.FloatTensor,
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
+ """
+ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ This function calls `step_prk()` or `step_plms()` depending on the internal variable `counter`.
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+ return_dict (`bool`): option for returning tuple rather than SchedulerOutput class
+
+ Returns:
+ [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+ [`~schedulers.scheduling_utils.SchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
+ returning a tuple, the first element is the sample tensor.
+
+ """
+ if self.counter < len(self.prk_timesteps) and not self.config.skip_prk_steps:
+ return self.step_prk(model_output=model_output, timestep=timestep, sample=sample, return_dict=return_dict)
+ else:
+ return self.step_plms(model_output=model_output, timestep=timestep, sample=sample, return_dict=return_dict)
+
+ def step_prk(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: int,
+ sample: torch.FloatTensor,
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
+ """
+ Step function propagating the sample with the Runge-Kutta method. RK takes 4 forward passes to approximate the
+ solution to the differential equation.
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+ return_dict (`bool`): option for returning tuple rather than SchedulerOutput class
+
+ Returns:
+ [`~scheduling_utils.SchedulerOutput`] or `tuple`: [`~scheduling_utils.SchedulerOutput`] if `return_dict` is
+ True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+
+ """
+ if self.num_inference_steps is None:
+ raise ValueError(
+ "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+ )
+
+ diff_to_prev = 0 if self.counter % 2 else self.config.num_train_timesteps // self.num_inference_steps // 2
+ prev_timestep = timestep - diff_to_prev
+ timestep = self.prk_timesteps[self.counter // 4 * 4]
+
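+        # Classical fourth-order Runge-Kutta accumulation: the four slopes are
+        # weighted 1/6, 1/3, 1/3, 1/6, the combined output is applied on every
+        # fourth call, and `cur_sample` pins the sample the RK step started from.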
+ if self.counter % 4 == 0:
+ self.cur_model_output += 1 / 6 * model_output
+ self.ets.append(model_output)
+ self.cur_sample = sample
+ elif (self.counter - 1) % 4 == 0:
+ self.cur_model_output += 1 / 3 * model_output
+ elif (self.counter - 2) % 4 == 0:
+ self.cur_model_output += 1 / 3 * model_output
+ elif (self.counter - 3) % 4 == 0:
+ model_output = self.cur_model_output + 1 / 6 * model_output
+ self.cur_model_output = 0
+
+ # cur_sample should not be `None`
+ cur_sample = self.cur_sample if self.cur_sample is not None else sample
+
+ prev_sample, pred_original_sample = self._get_prev_sample(cur_sample, timestep, prev_timestep, model_output)
+ self.counter += 1
+
+ if not return_dict:
+ return (prev_sample,)
+
+ return SchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+
+ def step_plms(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: int,
+ sample: torch.FloatTensor,
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
+ """
+        Step function propagating the sample with the linear multi-step method. Only one forward pass is needed per
+        step; up to three previously cached model outputs are combined with the new one to approximate the solution.
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+ return_dict (`bool`): option for returning tuple rather than SchedulerOutput class
+
+ Returns:
+ [`~scheduling_utils.SchedulerOutput`] or `tuple`: [`~scheduling_utils.SchedulerOutput`] if `return_dict` is
+ True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+
+ """
+ if self.num_inference_steps is None:
+ raise ValueError(
+ "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+ )
+
+ if not self.config.skip_prk_steps and len(self.ets) < 3:
+ raise ValueError(
+ f"{self.__class__} can only be run AFTER scheduler has been run "
+ "in 'prk' mode for at least 12 iterations "
+ "See: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_pndm.py "
+ "for more information."
+ )
+
+ prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
+
+ if self.counter != 1:
+ self.ets = self.ets[-3:]
+ self.ets.append(model_output)
+ else:
+ prev_timestep = timestep
+ timestep = timestep + self.config.num_train_timesteps // self.num_inference_steps
+
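+        # Warm-up: with a single cached output fall back to lower-order rules (plain
+        # Euler on the first call, a Heun-like average on the second); afterwards use
+        # the Adams-Bashforth coefficients of orders 2-4.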
+ if len(self.ets) == 1 and self.counter == 0:
+ self.cur_sample = sample
+ elif len(self.ets) == 1 and self.counter == 1:
+ model_output = (model_output + self.ets[-1]) / 2
+ sample = self.cur_sample
+ self.cur_sample = None
+ elif len(self.ets) == 2:
+ model_output = (3 * self.ets[-1] - self.ets[-2]) / 2
+ elif len(self.ets) == 3:
+ model_output = (23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12
+ else:
+ model_output = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 * self.ets[-3] - 9 * self.ets[-4])
+
+ prev_sample, pred_original_sample = self._get_prev_sample(sample, timestep, prev_timestep, model_output)
+ self.counter += 1
+
+ if not return_dict:
+ return (prev_sample,)
+
+ return SchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+
+ def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
+ """
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+ current timestep.
+
+ Args:
+ sample (`torch.FloatTensor`): input sample
+
+ Returns:
+ `torch.FloatTensor`: scaled input sample
+ """
+ return sample
+
+ def _get_prev_sample(self, sample, timestep, prev_timestep, model_output):
+ # See formula (9) of PNDM paper https://arxiv.org/pdf/2202.09778.pdf
+ # this function computes x_(t−δ) using the formula of (9)
+ # Note that x_t needs to be added to both sides of the equation
+
+        # Notation (<variable name> -> <name in paper>)
+ # alpha_prod_t -> α_t
+ # alpha_prod_t_prev -> α_(t−δ)
+ # beta_prod_t -> (1 - α_t)
+ # beta_prod_t_prev -> (1 - α_(t−δ))
+ # sample -> x_t
+ # model_output -> e_θ(x_t, t)
+ # prev_sample -> x_(t−δ)
+ alpha_prod_t = self.alphas_cumprod[timestep]
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+ beta_prod_t = 1 - alpha_prod_t
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+ # corresponds to (α_(t−δ) - α_t) divided by
+ # denominator of x_t in formula (9) and plus 1
+        # Note: (α_(t−δ) - α_t) / (sqrt(α_t) * (sqrt(α_(t−δ)) + sqrt(α_t))) + 1 =
+        # sqrt(α_(t−δ)) / sqrt(α_t)
+ sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5)
+
+ # corresponds to denominator of e_θ(x_t, t) in formula (9)
+ model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + (
+ alpha_prod_t * beta_prod_t * alpha_prod_t_prev
+ ) ** (0.5)
+
+ # full formula (9)
+ prev_sample = (
+ sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff
+ )
+
+ # add pred_original_sample
+ pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+ return prev_sample, pred_original_sample
+
+ def add_noise(
+ self,
+ original_samples: torch.FloatTensor,
+ noise: torch.FloatTensor,
+ timesteps: torch.IntTensor,
+ ) -> torch.Tensor:
+ # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+ self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
+ timesteps = timesteps.to(original_samples.device)
+
+ sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+ while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+ sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+ return noisy_samples
+
+ def __len__(self):
+ return self.config.num_train_timesteps
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_pndm_flax.py b/src/model/TextGen/diffusers/schedulers/scheduling_pndm_flax.py
new file mode 100644
index 0000000..357ecfe
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_pndm_flax.py
@@ -0,0 +1,524 @@
+# Copyright 2022 Zhejiang University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import flax
+import jax
+import jax.numpy as jnp
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps: int, max_beta=0.999) -> jnp.ndarray:
+ """
+ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+ (1-beta) over time from t = [0,1].
+
+ Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+ to that part of the diffusion process.
+
+
+ Args:
+ num_diffusion_timesteps (`int`): the number of betas to produce.
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to
+ prevent singularities.
+
+ Returns:
+ betas (`jnp.ndarray`): the betas used by the scheduler to step the model outputs
+ """
+
+ def alpha_bar(time_step):
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+ betas = []
+ for i in range(num_diffusion_timesteps):
+ t1 = i / num_diffusion_timesteps
+ t2 = (i + 1) / num_diffusion_timesteps
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+ return jnp.array(betas, dtype=jnp.float32)
+
+
+@flax.struct.dataclass
+class PNDMSchedulerState:
+ # setable values
+ _timesteps: jnp.ndarray
+ num_inference_steps: Optional[int] = None
+ prk_timesteps: Optional[jnp.ndarray] = None
+ plms_timesteps: Optional[jnp.ndarray] = None
+ timesteps: Optional[jnp.ndarray] = None
+
+ # running values
+ cur_model_output: Optional[jnp.ndarray] = None
+ counter: int = 0
+ cur_sample: Optional[jnp.ndarray] = None
+ ets: jnp.ndarray = jnp.array([])
+
+ @classmethod
+ def create(cls, num_train_timesteps: int):
+ return cls(_timesteps=jnp.arange(0, num_train_timesteps)[::-1])
+
+
+@dataclass
+class FlaxPNDMSchedulerOutput(FlaxSchedulerOutput):
+ state: PNDMSchedulerState
+
+
+class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin):
+ """
+    Pseudo numerical methods for diffusion models (PNDM) propose using more advanced ODE integration techniques,
+    namely the Runge-Kutta method and a linear multi-step method.
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ For more details, see the original paper: https://arxiv.org/abs/2202.09778
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ beta_start (`float`): the starting `beta` value of inference.
+ beta_end (`float`): the final `beta` value.
+ beta_schedule (`str`):
+ the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+ `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+ trained_betas (`jnp.ndarray`, optional):
+ option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+ skip_prk_steps (`bool`):
+ allows the scheduler to skip the Runge-Kutta steps that are defined in the original paper as being required
+ before plms steps; defaults to `False`.
+ set_alpha_to_one (`bool`, default `False`):
+ each diffusion step uses the value of alphas product at that step and at the previous one. For the final
+ step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
+ otherwise it uses the value of alpha at step 0.
+ steps_offset (`int`, default `0`):
+            an offset added to the inference steps. You can use a combination of `offset=1` and
+            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product, as done in
+            Stable Diffusion.
+ """
+
+ @property
+ def has_state(self):
+ return True
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ trained_betas: Optional[jnp.ndarray] = None,
+ skip_prk_steps: bool = False,
+ set_alpha_to_one: bool = False,
+ steps_offset: int = 0,
+ ):
+ if trained_betas is not None:
+ self.betas = jnp.asarray(trained_betas)
+ elif beta_schedule == "linear":
+ self.betas = jnp.linspace(beta_start, beta_end, num_train_timesteps, dtype=jnp.float32)
+ elif beta_schedule == "scaled_linear":
+ # this schedule is very specific to the latent diffusion model.
+ self.betas = jnp.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=jnp.float32) ** 2
+ elif beta_schedule == "squaredcos_cap_v2":
+ # Glide cosine schedule
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
+ else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+ self.alphas = 1.0 - self.betas
+ self.alphas_cumprod = jnp.cumprod(self.alphas, axis=0)
+
+ self.final_alpha_cumprod = jnp.array(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
+
+ # For now we only support F-PNDM, i.e. the runge-kutta method
+ # For more information on the algorithm please take a look at the paper: https://arxiv.org/pdf/2202.09778.pdf
+ # mainly at formula (9), (12), (13) and the Algorithm 2.
+ self.pndm_order = 4
+
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = 1.0
+
+ def create_state(self):
+ return PNDMSchedulerState.create(num_train_timesteps=self.config.num_train_timesteps)
+
+ def set_timesteps(self, state: PNDMSchedulerState, num_inference_steps: int, shape: Tuple) -> PNDMSchedulerState:
+ """
+ Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ state (`PNDMSchedulerState`):
+ the `FlaxPNDMScheduler` state data class instance.
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ shape (`Tuple`):
+ the shape of the samples to be generated.
+ """
+ offset = self.config.steps_offset
+
+ step_ratio = self.config.num_train_timesteps // num_inference_steps
+ # creates integer timesteps by multiplying by ratio
+        # rounding to avoid issues when num_inference_steps is a power of 3
+ _timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round() + offset
+
+ state = state.replace(num_inference_steps=num_inference_steps, _timesteps=_timesteps)
+
+ if self.config.skip_prk_steps:
+ # for some models like stable diffusion the prk steps can/should be skipped to
+ # produce better results. When using PNDM with `self.config.skip_prk_steps` the implementation
+ # is based on crowsonkb's PLMS sampler implementation: https://github.com/CompVis/latent-diffusion/pull/51
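+            # Illustrative example (not from upstream): with num_train_timesteps=1000
+            # and num_inference_steps=50, step_ratio is 20, `_timesteps` is
+            # [0, 20, ..., 980], and the duplication below yields plms_timesteps
+            # [980, 960, 960, 940, ..., 0].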
+ state = state.replace(
+ prk_timesteps=jnp.array([]),
+ plms_timesteps=jnp.concatenate(
+ [state._timesteps[:-1], state._timesteps[-2:-1], state._timesteps[-1:]]
+ )[::-1],
+ )
+ else:
+ prk_timesteps = jnp.array(state._timesteps[-self.pndm_order :]).repeat(2) + jnp.tile(
+ jnp.array([0, self.config.num_train_timesteps // num_inference_steps // 2]), self.pndm_order
+ )
+
+ state = state.replace(
+ prk_timesteps=(prk_timesteps[:-1].repeat(2)[1:-1])[::-1],
+ plms_timesteps=state._timesteps[:-3][::-1],
+ )
+
+ return state.replace(
+ timesteps=jnp.concatenate([state.prk_timesteps, state.plms_timesteps]).astype(jnp.int32),
+ counter=0,
+ # Reserve space for the state variables
+ cur_model_output=jnp.zeros(shape),
+ cur_sample=jnp.zeros(shape),
+ ets=jnp.zeros((4,) + shape),
+ )
+
+ def scale_model_input(
+ self, state: PNDMSchedulerState, sample: jnp.ndarray, timestep: Optional[int] = None
+ ) -> jnp.ndarray:
+ """
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+ current timestep.
+
+ Args:
+ state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance.
+ sample (`jnp.ndarray`): input sample
+ timestep (`int`, optional): current timestep
+
+ Returns:
+ `jnp.ndarray`: scaled input sample
+ """
+ return sample
+
+ def step(
+ self,
+ state: PNDMSchedulerState,
+ model_output: jnp.ndarray,
+ timestep: int,
+ sample: jnp.ndarray,
+ return_dict: bool = True,
+ ) -> Union[FlaxPNDMSchedulerOutput, Tuple]:
+ """
+ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ This function calls `step_prk()` or `step_plms()` depending on the internal variable `counter`.
+
+ Args:
+ state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance.
+ model_output (`jnp.ndarray`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`jnp.ndarray`):
+ current instance of sample being created by diffusion process.
+ return_dict (`bool`): option for returning tuple rather than FlaxPNDMSchedulerOutput class
+
+ Returns:
+ [`FlaxPNDMSchedulerOutput`] or `tuple`: [`FlaxPNDMSchedulerOutput`] if `return_dict` is True, otherwise a
+ `tuple`. When returning a tuple, the first element is the sample tensor.
+
+ """
+ if self.config.skip_prk_steps:
+ prev_sample, state = self.step_plms(
+ state=state, model_output=model_output, timestep=timestep, sample=sample
+ )
+ else:
+ prev_sample, state = jax.lax.switch(
+ jnp.where(state.counter < len(state.prk_timesteps), 0, 1),
+ (self.step_prk, self.step_plms),
+ # Args to either branch
+ state,
+ model_output,
+ timestep,
+ sample,
+ )
+
+ if not return_dict:
+ return (prev_sample, state)
+
+ return FlaxPNDMSchedulerOutput(prev_sample=prev_sample, state=state)
+
+ def step_prk(
+ self,
+ state: PNDMSchedulerState,
+ model_output: jnp.ndarray,
+ timestep: int,
+ sample: jnp.ndarray,
+    ) -> Tuple[jnp.ndarray, PNDMSchedulerState]:
+ """
+ Step function propagating the sample with the Runge-Kutta method. RK takes 4 forward passes to approximate the
+ solution to the differential equation.
+
+ Args:
+ state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance.
+ model_output (`jnp.ndarray`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`jnp.ndarray`):
+ current instance of sample being created by diffusion process.
+
+        Returns:
+            `tuple`: a `(prev_sample, state)` pair, where the first element is the sample tensor.
+
+ """
+ if state.num_inference_steps is None:
+ raise ValueError(
+ "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+ )
+
+ diff_to_prev = jnp.where(
+ state.counter % 2, 0, self.config.num_train_timesteps // state.num_inference_steps // 2
+ )
+ prev_timestep = timestep - diff_to_prev
+ timestep = state.prk_timesteps[state.counter // 4 * 4]
+
+ def remainder_0(state: PNDMSchedulerState, model_output: jnp.ndarray, ets_at: int):
+ return (
+ state.replace(
+ cur_model_output=state.cur_model_output + 1 / 6 * model_output,
+ ets=state.ets.at[ets_at].set(model_output),
+ cur_sample=sample,
+ ),
+ model_output,
+ )
+
+ def remainder_1(state: PNDMSchedulerState, model_output: jnp.ndarray, ets_at: int):
+ return state.replace(cur_model_output=state.cur_model_output + 1 / 3 * model_output), model_output
+
+ def remainder_2(state: PNDMSchedulerState, model_output: jnp.ndarray, ets_at: int):
+ return state.replace(cur_model_output=state.cur_model_output + 1 / 3 * model_output), model_output
+
+ def remainder_3(state: PNDMSchedulerState, model_output: jnp.ndarray, ets_at: int):
+ model_output = state.cur_model_output + 1 / 6 * model_output
+ return state.replace(cur_model_output=jnp.zeros_like(state.cur_model_output)), model_output
+
+ state, model_output = jax.lax.switch(
+ state.counter % 4,
+ (remainder_0, remainder_1, remainder_2, remainder_3),
+ # Args to either branch
+ state,
+ model_output,
+ state.counter // 4,
+ )
+
+ cur_sample = state.cur_sample
+ prev_sample = self._get_prev_sample(cur_sample, timestep, prev_timestep, model_output)
+ state = state.replace(counter=state.counter + 1)
+
+ return (prev_sample, state)
+
+ def step_plms(
+ self,
+ state: PNDMSchedulerState,
+ model_output: jnp.ndarray,
+ timestep: int,
+ sample: jnp.ndarray,
+    ) -> Tuple[jnp.ndarray, PNDMSchedulerState]:
+ """
+        Step function propagating the sample with the linear multi-step method. This takes a single forward pass per
+        step and reuses stored model outputs from previous steps to approximate the solution.
+
+ Args:
+ state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance.
+ model_output (`jnp.ndarray`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`jnp.ndarray`):
+ current instance of sample being created by diffusion process.
+
+        Returns:
+            `tuple`: a `(prev_sample, state)` pair, where the first element is the sample tensor.
+
+ """
+ if state.num_inference_steps is None:
+ raise ValueError(
+ "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+ )
+
+ if not self.config.skip_prk_steps and len(state.ets) < 3:
+ raise ValueError(
+ f"{self.__class__} can only be run AFTER scheduler has been run "
+ "in 'prk' mode for at least 12 iterations "
+ "See: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_pndm.py "
+ "for more information."
+ )
+
+ prev_timestep = timestep - self.config.num_train_timesteps // state.num_inference_steps
+ prev_timestep = jnp.where(prev_timestep > 0, prev_timestep, 0)
+
+ # Reference:
+ # if state.counter != 1:
+ # state.ets.append(model_output)
+ # else:
+ # prev_timestep = timestep
+ # timestep = timestep + self.config.num_train_timesteps // state.num_inference_steps
+
+ prev_timestep = jnp.where(state.counter == 1, timestep, prev_timestep)
+ timestep = jnp.where(
+ state.counter == 1, timestep + self.config.num_train_timesteps // state.num_inference_steps, timestep
+ )
+
+ # Reference:
+ # if len(state.ets) == 1 and state.counter == 0:
+ # model_output = model_output
+ # state.cur_sample = sample
+ # elif len(state.ets) == 1 and state.counter == 1:
+ # model_output = (model_output + state.ets[-1]) / 2
+ # sample = state.cur_sample
+ # state.cur_sample = None
+ # elif len(state.ets) == 2:
+ # model_output = (3 * state.ets[-1] - state.ets[-2]) / 2
+ # elif len(state.ets) == 3:
+ # model_output = (23 * state.ets[-1] - 16 * state.ets[-2] + 5 * state.ets[-3]) / 12
+ # else:
+ # model_output = (1 / 24) * (55 * state.ets[-1] - 59 * state.ets[-2] + 37 * state.ets[-3] - 9 * state.ets[-4])
+
+ def counter_0(state: PNDMSchedulerState):
+ ets = state.ets.at[0].set(model_output)
+ return state.replace(
+ ets=ets,
+ cur_sample=sample,
+ cur_model_output=jnp.array(model_output, dtype=jnp.float32),
+ )
+
+ def counter_1(state: PNDMSchedulerState):
+ return state.replace(
+ cur_model_output=(model_output + state.ets[0]) / 2,
+ )
+
+ def counter_2(state: PNDMSchedulerState):
+ ets = state.ets.at[1].set(model_output)
+ return state.replace(
+ ets=ets,
+ cur_model_output=(3 * ets[1] - ets[0]) / 2,
+ cur_sample=sample,
+ )
+
+ def counter_3(state: PNDMSchedulerState):
+ ets = state.ets.at[2].set(model_output)
+ return state.replace(
+ ets=ets,
+ cur_model_output=(23 * ets[2] - 16 * ets[1] + 5 * ets[0]) / 12,
+ cur_sample=sample,
+ )
+
+ def counter_other(state: PNDMSchedulerState):
+ ets = state.ets.at[3].set(model_output)
+ next_model_output = (1 / 24) * (55 * ets[3] - 59 * ets[2] + 37 * ets[1] - 9 * ets[0])
+
+ ets = ets.at[0].set(ets[1])
+ ets = ets.at[1].set(ets[2])
+ ets = ets.at[2].set(ets[3])
+
+ return state.replace(
+ ets=ets,
+ cur_model_output=next_model_output,
+ cur_sample=sample,
+ )
+
+ counter = jnp.clip(state.counter, 0, 4)
+ state = jax.lax.switch(
+ counter,
+ [counter_0, counter_1, counter_2, counter_3, counter_other],
+ state,
+ )
+
+ sample = state.cur_sample
+ model_output = state.cur_model_output
+ prev_sample = self._get_prev_sample(sample, timestep, prev_timestep, model_output)
+ state = state.replace(counter=state.counter + 1)
+
+ return (prev_sample, state)
+
+ def _get_prev_sample(self, sample, timestep, prev_timestep, model_output):
+ # See formula (9) of PNDM paper https://arxiv.org/pdf/2202.09778.pdf
+ # this function computes x_(t−δ) using the formula of (9)
+ # Note that x_t needs to be added to both sides of the equation
+
+        # Notation (<variable name> -> <name in paper>)
+ # alpha_prod_t -> α_t
+ # alpha_prod_t_prev -> α_(t−δ)
+ # beta_prod_t -> (1 - α_t)
+ # beta_prod_t_prev -> (1 - α_(t−δ))
+ # sample -> x_t
+ # model_output -> e_θ(x_t, t)
+ # prev_sample -> x_(t−δ)
+ alpha_prod_t = self.alphas_cumprod[timestep]
+ alpha_prod_t_prev = jnp.where(prev_timestep >= 0, self.alphas_cumprod[prev_timestep], self.final_alpha_cumprod)
+ beta_prod_t = 1 - alpha_prod_t
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        # corresponds to (α_(t−δ) - α_t) divided by
+        # the denominator of x_t in formula (9), plus 1
+        # Note: (α_(t−δ) - α_t) / (sqrt(α_t) * (sqrt(α_(t−δ)) + sqrt(α_t))) + 1 =
+        # sqrt(α_(t−δ)) / sqrt(α_t)
+ sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5)
+
+ # corresponds to denominator of e_θ(x_t, t) in formula (9)
+ model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + (
+ alpha_prod_t * beta_prod_t * alpha_prod_t_prev
+ ) ** (0.5)
+
+ # full formula (9)
+ prev_sample = (
+ sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff
+ )
+
+ return prev_sample
+
+ def add_noise(
+ self,
+ original_samples: jnp.ndarray,
+ noise: jnp.ndarray,
+ timesteps: jnp.ndarray,
+ ) -> jnp.ndarray:
+ sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+ sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape)
+
+ sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+ sqrt_one_minus_alpha_prod = broadcast_to_shape_from_left(sqrt_one_minus_alpha_prod, original_samples.shape)
+
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+ return noisy_samples
+
+ def __len__(self):
+ return self.config.num_train_timesteps
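+
+
+# --- Illustrative sampling-loop sketch (not part of the upstream file) ---
+# The Flax API is functional: an explicit PNDMSchedulerState is threaded
+# through `set_timesteps` and `step`. The zero model output below is a
+# stand-in for a real denoiser, purely to show the state-threading pattern.
+if __name__ == "__main__":
+    scheduler = FlaxPNDMScheduler(skip_prk_steps=True)
+    state = scheduler.create_state()
+    sample = jnp.zeros((1, 4, 8, 8))
+    state = scheduler.set_timesteps(state, num_inference_steps=10, shape=sample.shape)
+    for t in state.timesteps:
+        model_output = jnp.zeros_like(sample)  # stand-in for a denoiser
+        sample, state = scheduler.step(state, model_output, t, sample, return_dict=False)
+    print(sample.shape)  # (1, 4, 8, 8)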
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_repaint.py b/src/model/TextGen/diffusers/schedulers/scheduling_repaint.py
new file mode 100644
index 0000000..1751f41
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_repaint.py
@@ -0,0 +1,322 @@
+# Copyright 2022 ETH Zurich Computer Vision Lab and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from .scheduling_utils import SchedulerMixin
+
+
+@dataclass
+class RePaintSchedulerOutput(BaseOutput):
+ """
+ Output class for the scheduler's step function output.
+
+ Args:
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+ Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+ denoising loop.
+ pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+ The predicted denoised sample (x_{0}) based on the model output from
+ the current timestep. `pred_original_sample` can be used to preview progress or for guidance.
+ """
+
+ prev_sample: torch.FloatTensor
+ pred_original_sample: torch.FloatTensor
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
+ """
+ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+ (1-beta) over time from t = [0,1].
+
+ Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+ to that part of the diffusion process.
+
+
+ Args:
+ num_diffusion_timesteps (`int`): the number of betas to produce.
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to
+ prevent singularities.
+
+ Returns:
+ betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+ """
+
+ def alpha_bar(time_step):
+ return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+ betas = []
+ for i in range(num_diffusion_timesteps):
+ t1 = i / num_diffusion_timesteps
+ t2 = (i + 1) / num_diffusion_timesteps
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+ return torch.tensor(betas, dtype=torch.float32)
+
+
+class RePaintScheduler(SchedulerMixin, ConfigMixin):
+ """
+    RePaint is a scheduler for DDPM-based inpainting inside a given mask.
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ For more details, see the original paper: https://arxiv.org/pdf/2201.09865.pdf
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ beta_start (`float`): the starting `beta` value of inference.
+ beta_end (`float`): the final `beta` value.
+ beta_schedule (`str`):
+ the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+ `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+ eta (`float`):
+            The weight of the added noise in a diffusion step. Its value is between 0.0 and 1.0; 0.0 corresponds to
+            DDIM and 1.0 to the DDPM scheduler.
+ trained_betas (`np.ndarray`, optional):
+ option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+ clip_sample (`bool`, default `True`):
+ option to clip predicted sample between -1 and 1 for numerical stability.
+
+ """
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 1000,
+ beta_start: float = 0.0001,
+ beta_end: float = 0.02,
+ beta_schedule: str = "linear",
+ eta: float = 0.0,
+ trained_betas: Optional[np.ndarray] = None,
+ clip_sample: bool = True,
+ ):
+ if trained_betas is not None:
+ self.betas = torch.from_numpy(trained_betas)
+ elif beta_schedule == "linear":
+ self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+ elif beta_schedule == "scaled_linear":
+ # this schedule is very specific to the latent diffusion model.
+ self.betas = (
+ torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+ )
+ elif beta_schedule == "squaredcos_cap_v2":
+ # Glide cosine schedule
+ self.betas = betas_for_alpha_bar(num_train_timesteps)
+ elif beta_schedule == "sigmoid":
+ # GeoDiff sigmoid schedule
+ betas = torch.linspace(-6, 6, num_train_timesteps)
+ self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start
+ else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+ self.alphas = 1.0 - self.betas
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+ self.one = torch.tensor(1.0)
+
+ self.final_alpha_cumprod = torch.tensor(1.0)
+
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = 1.0
+
+ # setable values
+ self.num_inference_steps = None
+ self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy())
+
+ self.eta = eta
+
+ def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
+ """
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+ current timestep.
+
+ Args:
+ sample (`torch.FloatTensor`): input sample
+ timestep (`int`, optional): current timestep
+
+ Returns:
+ `torch.FloatTensor`: scaled input sample
+ """
+ return sample
+
+ def set_timesteps(
+ self,
+ num_inference_steps: int,
+ jump_length: int = 10,
+ jump_n_sample: int = 10,
+ device: Union[str, torch.device] = None,
+ ):
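+        # Illustrative example (not from upstream): with num_inference_steps=6,
+        # jump_length=2 and jump_n_sample=2, the loop below produces the raw
+        # schedule [5, 4, 3, 2, 3, 4, 3, 2, 1, 0, 1, 2, 1, 0] (re-noising jumps
+        # appear as locally increasing timesteps) before scaling by
+        # num_train_timesteps // num_inference_steps.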
+ num_inference_steps = min(self.config.num_train_timesteps, num_inference_steps)
+ self.num_inference_steps = num_inference_steps
+
+ timesteps = []
+
+ jumps = {}
+ for j in range(0, num_inference_steps - jump_length, jump_length):
+ jumps[j] = jump_n_sample - 1
+
+ t = num_inference_steps
+ while t >= 1:
+ t = t - 1
+ timesteps.append(t)
+
+ if jumps.get(t, 0) > 0:
+ jumps[t] = jumps[t] - 1
+ for _ in range(jump_length):
+ t = t + 1
+ timesteps.append(t)
+
+ timesteps = np.array(timesteps) * (self.config.num_train_timesteps // self.num_inference_steps)
+ self.timesteps = torch.from_numpy(timesteps).to(device)
+
+ def _get_variance(self, t):
+ prev_timestep = t - self.config.num_train_timesteps // self.num_inference_steps
+
+ alpha_prod_t = self.alphas_cumprod[t]
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+ beta_prod_t = 1 - alpha_prod_t
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+ # For t > 0, compute predicted variance βt (see formula (6) and (7) from
+ # https://arxiv.org/pdf/2006.11239.pdf) and sample from it to get
+ # previous sample x_{t-1} ~ N(pred_prev_sample, variance) == add
+ # variance to pred_sample
+ # Is equivalent to formula (16) in https://arxiv.org/pdf/2010.02502.pdf
+ # without eta.
+ # variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[t]
+ variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+
+ return variance
+
+ def step(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: int,
+ sample: torch.FloatTensor,
+ original_image: torch.FloatTensor,
+ mask: torch.FloatTensor,
+ generator: Optional[torch.Generator] = None,
+ return_dict: bool = True,
+ ) -> Union[RePaintSchedulerOutput, Tuple]:
+ """
+ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned
+ diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+ original_image (`torch.FloatTensor`):
+ the original image to inpaint on.
+ mask (`torch.FloatTensor`):
+ the mask where 0.0 values define which part of the original image to inpaint (change).
+ generator (`torch.Generator`, *optional*): random number generator.
+            return_dict (`bool`): option for returning tuple rather than
+                RePaintSchedulerOutput class
+
+ Returns:
+ [`~schedulers.scheduling_utils.RePaintSchedulerOutput`] or `tuple`:
+ [`~schedulers.scheduling_utils.RePaintSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
+ returning a tuple, the first element is the sample tensor.
+
+ """
+ t = timestep
+ prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
+
+ # 1. compute alphas, betas
+ alpha_prod_t = self.alphas_cumprod[t]
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+ beta_prod_t = 1 - alpha_prod_t
+
+ # 2. compute predicted original sample from predicted noise also called
+ # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
+ pred_original_sample = (sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
+
+ # 3. Clip "predicted x_0"
+ if self.config.clip_sample:
+ pred_original_sample = torch.clamp(pred_original_sample, -1, 1)
+
+ # We choose to follow RePaint Algorithm 1 to get x_{t-1}, however we
+ # substitute formula (7) in the algorithm coming from DDPM paper
+ # (formula (4) Algorithm 2 - Sampling) with formula (12) from DDIM paper.
+ # DDIM schedule gives the same results as DDPM with eta = 1.0
+ # Noise is being reused in 7. and 8., but no impact on quality has
+ # been observed.
+
+ # 5. Add noise
+ noise = torch.randn(
+ model_output.shape, dtype=model_output.dtype, generator=generator, device=model_output.device
+ )
+ std_dev_t = self.eta * self._get_variance(timestep) ** 0.5
+
+ variance = 0
+ if t > 0 and self.eta > 0:
+ variance = std_dev_t * noise
+
+ # 6. compute "direction pointing to x_t" of formula (12)
+ # from https://arxiv.org/pdf/2010.02502.pdf
+ pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** 0.5 * model_output
+
+ # 7. compute x_{t-1} of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+ prev_unknown_part = alpha_prod_t_prev**0.5 * pred_original_sample + pred_sample_direction + variance
+
+ # 8. Algorithm 1 Line 5 https://arxiv.org/pdf/2201.09865.pdf
+ prev_known_part = (alpha_prod_t**0.5) * original_image + ((1 - alpha_prod_t) ** 0.5) * noise
+
+ # 9. Algorithm 1 Line 8 https://arxiv.org/pdf/2201.09865.pdf
+ pred_prev_sample = mask * prev_known_part + (1.0 - mask) * prev_unknown_part
+
+ if not return_dict:
+ return (
+ pred_prev_sample,
+ pred_original_sample,
+ )
+
+ return RePaintSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample)
+
+ def undo_step(self, sample, timestep, generator=None):
+ n = self.config.num_train_timesteps // self.num_inference_steps
+
+ for i in range(n):
+ beta = self.betas[timestep + i]
+ noise = torch.randn(sample.shape, generator=generator, device=sample.device)
+
+ # 10. Algorithm 1 Line 10 https://arxiv.org/pdf/2201.09865.pdf
+ sample = (1 - beta) ** 0.5 * sample + beta**0.5 * noise
+
+ return sample
+
+ def add_noise(
+ self,
+ original_samples: torch.FloatTensor,
+ noise: torch.FloatTensor,
+ timesteps: torch.IntTensor,
+ ) -> torch.FloatTensor:
+ raise NotImplementedError("Use `DDPMScheduler.add_noise()` to train for sampling with RePaint.")
+
+ def __len__(self):
+ return self.config.num_train_timesteps
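+
+
+# --- Illustrative inpainting-loop sketch (not part of the upstream file) ---
+# RePaint alternates reverse-diffusion steps with `undo_step` re-noising jumps;
+# the resampled schedule built by `set_timesteps` already encodes the jumps, so
+# a locally increasing timestep signals where to re-noise. The zero model
+# output is a stand-in for a real denoiser.
+if __name__ == "__main__":
+    scheduler = RePaintScheduler()
+    scheduler.set_timesteps(num_inference_steps=50, jump_length=10, jump_n_sample=2)
+    image = torch.randn(1, 3, 32, 32)   # original image, kept where mask == 1.0
+    mask = torch.ones_like(image)       # 1.0 = keep, 0.0 = inpaint
+    sample = torch.randn_like(image)
+    t_last = scheduler.timesteps[0] + 1
+    for t in scheduler.timesteps:
+        if t < t_last:  # regular reverse step
+            model_output = torch.zeros_like(sample)  # stand-in for a denoiser
+            sample = scheduler.step(model_output, t, sample, image, mask).prev_sample
+        else:           # timestep increased: re-noise with undo_step
+            sample = scheduler.undo_step(sample, t_last)
+        t_last = t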
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_sde_ve.py b/src/model/TextGen/diffusers/schedulers/scheduling_sde_ve.py
new file mode 100644
index 0000000..d31adbc
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_sde_ve.py
@@ -0,0 +1,264 @@
+# Copyright 2022 Google Brain and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/yang-song/score_sde_pytorch
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from .scheduling_utils import SchedulerMixin, SchedulerOutput
+
+
+@dataclass
+class SdeVeOutput(BaseOutput):
+ """
+ Output class for the ScoreSdeVeScheduler's step function output.
+
+ Args:
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+ Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+ denoising loop.
+ prev_sample_mean (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+ Mean averaged `prev_sample`. Same as `prev_sample`, only mean-averaged over previous timesteps.
+ """
+
+ prev_sample: torch.FloatTensor
+ prev_sample_mean: torch.FloatTensor
+
+
+class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin):
+ """
+ The variance exploding stochastic differential equation (SDE) scheduler.
+
+ For more information, see the original paper: https://arxiv.org/abs/2011.13456
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ snr (`float`):
+ coefficient weighting the step from the model_output sample (from the network) to the random noise.
+ sigma_min (`float`):
+ initial noise scale for sigma sequence in sampling procedure. The minimum sigma should mirror the
+ distribution of the data.
+ sigma_max (`float`): maximum value used for the range of continuous timesteps passed into the model.
+ sampling_eps (`float`): the end value of sampling, where timesteps decrease progressively from 1 to
+ epsilon.
+ correct_steps (`int`): number of correction steps performed on a produced sample.
+ """
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 2000,
+ snr: float = 0.15,
+ sigma_min: float = 0.01,
+ sigma_max: float = 1348.0,
+ sampling_eps: float = 1e-5,
+ correct_steps: int = 1,
+ ):
+ # standard deviation of the initial noise distribution
+ self.init_noise_sigma = sigma_max
+
+ # setable values
+ self.timesteps = None
+
+ self.set_sigmas(num_train_timesteps, sigma_min, sigma_max, sampling_eps)
+
+ def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
+ """
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+ current timestep.
+
+ Args:
+ sample (`torch.FloatTensor`): input sample
+ timestep (`int`, optional): current timestep
+
+ Returns:
+ `torch.FloatTensor`: scaled input sample
+ """
+ return sample
+
+ def set_timesteps(
+ self, num_inference_steps: int, sampling_eps: float = None, device: Union[str, torch.device] = None
+ ):
+ """
+ Sets the continuous timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ sampling_eps (`float`, optional): final timestep value (overrides value given at Scheduler instantiation).
+
+ """
+ sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps
+
+ self.timesteps = torch.linspace(1, sampling_eps, num_inference_steps, device=device)
+
+ def set_sigmas(
+ self, num_inference_steps: int, sigma_min: float = None, sigma_max: float = None, sampling_eps: float = None
+ ):
+ """
+ Sets the noise scales used for the diffusion chain. Supporting function to be run before inference.
+
+ The sigmas control the weight of the `drift` and `diffusion` components of sample update.
+
+ Args:
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ sigma_min (`float`, optional):
+ initial noise scale value (overrides value given at Scheduler instantiation).
+ sigma_max (`float`, optional): final noise scale value (overrides value given at Scheduler instantiation).
+ sampling_eps (`float`, optional): final timestep value (overrides value given at Scheduler instantiation).
+
+ """
+ sigma_min = sigma_min if sigma_min is not None else self.config.sigma_min
+ sigma_max = sigma_max if sigma_max is not None else self.config.sigma_max
+ sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps
+ if self.timesteps is None:
+ self.set_timesteps(num_inference_steps, sampling_eps)
+
+        self.discrete_sigmas = torch.exp(torch.linspace(math.log(sigma_min), math.log(sigma_max), num_inference_steps))
+        self.sigmas = torch.tensor([sigma_min * (sigma_max / sigma_min) ** t for t in self.timesteps])
+
+ def get_adjacent_sigma(self, timesteps, t):
+ return torch.where(
+ timesteps == 0,
+ torch.zeros_like(t.to(timesteps.device)),
+ self.discrete_sigmas[timesteps - 1].to(timesteps.device),
+ )
+
+ def step_pred(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: int,
+ sample: torch.FloatTensor,
+ generator: Optional[torch.Generator] = None,
+ return_dict: bool = True,
+ ) -> Union[SdeVeOutput, Tuple]:
+ """
+ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+ generator: random number generator.
+ return_dict (`bool`): option for returning tuple rather than SchedulerOutput class
+
+ Returns:
+ [`~schedulers.scheduling_sde_ve.SdeVeOutput`] or `tuple`: [`~schedulers.scheduling_sde_ve.SdeVeOutput`] if
+ `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+
+ """
+ if self.timesteps is None:
+ raise ValueError(
+ "`self.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler"
+ )
+
+ timestep = timestep * torch.ones(
+ sample.shape[0], device=sample.device
+ ) # torch.repeat_interleave(timestep, sample.shape[0])
+ timesteps = (timestep * (len(self.timesteps) - 1)).long()
+
+ # mps requires indices to be in the same device, so we use cpu as is the default with cuda
+ timesteps = timesteps.to(self.discrete_sigmas.device)
+
+ sigma = self.discrete_sigmas[timesteps].to(sample.device)
+ adjacent_sigma = self.get_adjacent_sigma(timesteps, timestep).to(sample.device)
+ drift = torch.zeros_like(sample)
+ diffusion = (sigma**2 - adjacent_sigma**2) ** 0.5
+
+ # equation 6 in the paper: the model_output modeled by the network is grad_x log pt(x)
+ # also equation 47 shows the analog from SDE models to ancestral sampling methods
+ diffusion = diffusion.flatten()
+ while len(diffusion.shape) < len(sample.shape):
+ diffusion = diffusion.unsqueeze(-1)
+ drift = drift - diffusion**2 * model_output
+
+        # equation 6: sample noise for the diffusion term of the SDE
+ noise = torch.randn(sample.shape, layout=sample.layout, generator=generator).to(sample.device)
+ prev_sample_mean = sample - drift # subtract because `dt` is a small negative timestep
+ # TODO is the variable diffusion the correct scaling term for the noise?
+ prev_sample = prev_sample_mean + diffusion * noise # add impact of diffusion field g
+
+ if not return_dict:
+ return (prev_sample, prev_sample_mean)
+
+ return SdeVeOutput(prev_sample=prev_sample, prev_sample_mean=prev_sample_mean)
+
+ def step_correct(
+ self,
+ model_output: torch.FloatTensor,
+ sample: torch.FloatTensor,
+ generator: Optional[torch.Generator] = None,
+ return_dict: bool = True,
+ ) -> Union[SchedulerOutput, Tuple]:
+ """
+ Correct the predicted sample based on the output model_output of the network. This is often run repeatedly
+ after making the prediction for the previous timestep.
+
+ Args:
+ model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+ sample (`torch.FloatTensor`):
+ current instance of sample being created by diffusion process.
+ generator: random number generator.
+ return_dict (`bool`): option for returning tuple rather than SchedulerOutput class
+
+ Returns:
+ [`~schedulers.scheduling_sde_ve.SdeVeOutput`] or `tuple`: [`~schedulers.scheduling_sde_ve.SdeVeOutput`] if
+ `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+
+ """
+ if self.timesteps is None:
+ raise ValueError(
+ "`self.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler"
+ )
+
+ # For small batch sizes, the paper "suggest replacing norm(z) with sqrt(d), where d is the dim. of z"
+ # sample noise for correction
+ noise = torch.randn(sample.shape, layout=sample.layout, generator=generator).to(sample.device)
+
+ # compute step size from the model_output, the noise, and the snr
+ grad_norm = torch.norm(model_output.reshape(model_output.shape[0], -1), dim=-1).mean()
+ noise_norm = torch.norm(noise.reshape(noise.shape[0], -1), dim=-1).mean()
+ step_size = (self.config.snr * noise_norm / grad_norm) ** 2 * 2
+ step_size = step_size * torch.ones(sample.shape[0]).to(sample.device)
+ # self.repeat_scalar(step_size, sample.shape[0])
+
+ # compute corrected sample: model_output term and noise term
+ step_size = step_size.flatten()
+ while len(step_size.shape) < len(sample.shape):
+ step_size = step_size.unsqueeze(-1)
+ prev_sample_mean = sample + step_size * model_output
+ prev_sample = prev_sample_mean + ((step_size * 2) ** 0.5) * noise
+
+ if not return_dict:
+ return (prev_sample,)
+
+ return SchedulerOutput(prev_sample=prev_sample)
+
+ def __len__(self):
+ return self.config.num_train_timesteps
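+
+
+# --- Illustrative predictor-corrector sketch (not part of the upstream file) ---
+# Sampling alternates Langevin correction (`step_correct`) with a reverse-SDE
+# prediction (`step_pred`); the random model output below is a stand-in for a
+# trained score network.
+if __name__ == "__main__":
+    scheduler = ScoreSdeVeScheduler()
+    scheduler.set_timesteps(num_inference_steps=100)
+    scheduler.set_sigmas(num_inference_steps=100)
+    sample = torch.randn(1, 3, 32, 32) * scheduler.init_noise_sigma
+    for t in scheduler.timesteps:
+        model_output = torch.randn_like(sample)  # stand-in score estimate
+        for _ in range(scheduler.config.correct_steps):
+            sample = scheduler.step_correct(model_output, sample).prev_sample
+        sample = scheduler.step_pred(model_output, t, sample).prev_sample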
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_sde_ve_flax.py b/src/model/TextGen/diffusers/schedulers/scheduling_sde_ve_flax.py
new file mode 100644
index 0000000..d3eaded
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_sde_ve_flax.py
@@ -0,0 +1,276 @@
+# Copyright 2022 Google Brain and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/yang-song/score_sde_pytorch
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import flax
+import jax.numpy as jnp
+from jax import random
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
+
+
+@flax.struct.dataclass
+class ScoreSdeVeSchedulerState:
+ # setable values
+ timesteps: Optional[jnp.ndarray] = None
+ discrete_sigmas: Optional[jnp.ndarray] = None
+ sigmas: Optional[jnp.ndarray] = None
+
+ @classmethod
+ def create(cls):
+ return cls()
+
+
+@dataclass
+class FlaxSdeVeOutput(FlaxSchedulerOutput):
+ """
+ Output class for the ScoreSdeVeScheduler's step function output.
+
+ Args:
+        state (`ScoreSdeVeSchedulerState`): the updated scheduler state.
+ prev_sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)` for images):
+ Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+ denoising loop.
+ prev_sample_mean (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)` for images):
+ Mean averaged `prev_sample`. Same as `prev_sample`, only mean-averaged over previous timesteps.
+ """
+
+ state: ScoreSdeVeSchedulerState
+ prev_sample: jnp.ndarray
+ prev_sample_mean: Optional[jnp.ndarray] = None
+
+
+class FlaxScoreSdeVeScheduler(FlaxSchedulerMixin, ConfigMixin):
+ """
+ The variance exploding stochastic differential equation (SDE) scheduler.
+
+ For more information, see the original paper: https://arxiv.org/abs/2011.13456
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ Args:
+ num_train_timesteps (`int`): number of diffusion steps used to train the model.
+ snr (`float`):
+ coefficient weighting the step from the model_output sample (from the network) to the random noise.
+ sigma_min (`float`):
+ initial noise scale for sigma sequence in sampling procedure. The minimum sigma should mirror the
+ distribution of the data.
+ sigma_max (`float`): maximum value used for the range of continuous timesteps passed into the model.
+ sampling_eps (`float`): the end value of sampling, where timesteps decrease progressively from 1 to
+ epsilon.
+ correct_steps (`int`): number of correction steps performed on a produced sample.
+ """
+
+ @property
+ def has_state(self):
+ return True
+
+ @register_to_config
+ def __init__(
+ self,
+ num_train_timesteps: int = 2000,
+ snr: float = 0.15,
+ sigma_min: float = 0.01,
+ sigma_max: float = 1348.0,
+ sampling_eps: float = 1e-5,
+ correct_steps: int = 1,
+ ):
+        pass  # all arguments are stored on `self.config` by `register_to_config`
+
+ def create_state(self):
+ state = ScoreSdeVeSchedulerState.create()
+ return self.set_sigmas(
+ state,
+ self.config.num_train_timesteps,
+ self.config.sigma_min,
+ self.config.sigma_max,
+ self.config.sampling_eps,
+ )
+
+ def set_timesteps(
+ self, state: ScoreSdeVeSchedulerState, num_inference_steps: int, shape: Tuple = (), sampling_eps: float = None
+ ) -> ScoreSdeVeSchedulerState:
+ """
+ Sets the continuous timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ state (`ScoreSdeVeSchedulerState`): the `FlaxScoreSdeVeScheduler` state data class instance.
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ sampling_eps (`float`, optional): final timestep value (overrides value given at Scheduler instantiation).
+
+ """
+ sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps
+
+ timesteps = jnp.linspace(1, sampling_eps, num_inference_steps)
+ return state.replace(timesteps=timesteps)
+
+ def set_sigmas(
+ self,
+ state: ScoreSdeVeSchedulerState,
+ num_inference_steps: int,
+ sigma_min: float = None,
+ sigma_max: float = None,
+ sampling_eps: float = None,
+ ) -> ScoreSdeVeSchedulerState:
+ """
+ Sets the noise scales used for the diffusion chain. Supporting function to be run before inference.
+
+ The sigmas control the weight of the `drift` and `diffusion` components of sample update.
+
+ Args:
+ state (`ScoreSdeVeSchedulerState`): the `FlaxScoreSdeVeScheduler` state data class instance.
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+ sigma_min (`float`, optional):
+ initial noise scale value (overrides value given at Scheduler instantiation).
+ sigma_max (`float`, optional): final noise scale value (overrides value given at Scheduler instantiation).
+ sampling_eps (`float`, optional): final timestep value (overrides value given at Scheduler instantiation).
+ """
+ sigma_min = sigma_min if sigma_min is not None else self.config.sigma_min
+ sigma_max = sigma_max if sigma_max is not None else self.config.sigma_max
+ sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps
+ if state.timesteps is None:
+ state = self.set_timesteps(state, num_inference_steps, sampling_eps)
+
+ discrete_sigmas = jnp.exp(jnp.linspace(jnp.log(sigma_min), jnp.log(sigma_max), num_inference_steps))
+ sigmas = jnp.array([sigma_min * (sigma_max / sigma_min) ** t for t in state.timesteps])
+
+ return state.replace(discrete_sigmas=discrete_sigmas, sigmas=sigmas)
+
+ def get_adjacent_sigma(self, state, timesteps, t):
+ return jnp.where(timesteps == 0, jnp.zeros_like(t), state.discrete_sigmas[timesteps - 1])
+
+ def step_pred(
+ self,
+ state: ScoreSdeVeSchedulerState,
+ model_output: jnp.ndarray,
+ timestep: int,
+ sample: jnp.ndarray,
+ key: random.KeyArray,
+ return_dict: bool = True,
+ ) -> Union[FlaxSdeVeOutput, Tuple]:
+ """
+ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+ process from the learned model outputs (most often the predicted noise).
+
+ Args:
+ state (`ScoreSdeVeSchedulerState`): the `FlaxScoreSdeVeScheduler` state data class instance.
+ model_output (`jnp.ndarray`): direct output from learned diffusion model.
+ timestep (`int`): current discrete timestep in the diffusion chain.
+ sample (`jnp.ndarray`):
+ current instance of sample being created by diffusion process.
+            key (`random.KeyArray`): a PRNG key used to sample the noise.
+ return_dict (`bool`): option for returning tuple rather than FlaxSdeVeOutput class
+
+ Returns:
+ [`FlaxSdeVeOutput`] or `tuple`: [`FlaxSdeVeOutput`] if `return_dict` is True, otherwise a `tuple`. When
+ returning a tuple, the first element is the sample tensor.
+
+ """
+ if state.timesteps is None:
+ raise ValueError(
+ "`state.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler"
+ )
+
+ timestep = timestep * jnp.ones(
+ sample.shape[0],
+ )
+        timesteps = (timestep * (len(state.timesteps) - 1)).astype(jnp.int32)  # jnp arrays have no `.long()`
+
+ sigma = state.discrete_sigmas[timesteps]
+ adjacent_sigma = self.get_adjacent_sigma(state, timesteps, timestep)
+ drift = jnp.zeros_like(sample)
+ diffusion = (sigma**2 - adjacent_sigma**2) ** 0.5
+
+ # equation 6 in the paper: the model_output modeled by the network is grad_x log pt(x)
+ # also equation 47 shows the analog from SDE models to ancestral sampling methods
+ diffusion = diffusion.flatten()
+ diffusion = broadcast_to_shape_from_left(diffusion, sample.shape)
+ drift = drift - diffusion**2 * model_output
+
+        # equation 6: sample noise for the diffusion term of the SDE
+ key = random.split(key, num=1)
+ noise = random.normal(key=key, shape=sample.shape)
+ prev_sample_mean = sample - drift # subtract because `dt` is a small negative timestep
+ # TODO is the variable diffusion the correct scaling term for the noise?
+ prev_sample = prev_sample_mean + diffusion * noise # add impact of diffusion field g
+
+ if not return_dict:
+ return (prev_sample, prev_sample_mean, state)
+
+ return FlaxSdeVeOutput(prev_sample=prev_sample, prev_sample_mean=prev_sample_mean, state=state)
+
+ def step_correct(
+ self,
+ state: ScoreSdeVeSchedulerState,
+ model_output: jnp.ndarray,
+ sample: jnp.ndarray,
+ key: random.KeyArray,
+ return_dict: bool = True,
+ ) -> Union[FlaxSdeVeOutput, Tuple]:
+ """
+ Correct the predicted sample based on the output model_output of the network. This is often run repeatedly
+ after making the prediction for the previous timestep.
+
+ Args:
+ state (`ScoreSdeVeSchedulerState`): the `FlaxScoreSdeVeScheduler` state data class instance.
+ model_output (`jnp.ndarray`): direct output from learned diffusion model.
+ sample (`jnp.ndarray`):
+ current instance of sample being created by diffusion process.
+            key (`random.KeyArray`): a PRNG key used to sample the noise.
+ return_dict (`bool`): option for returning tuple rather than FlaxSdeVeOutput class
+
+ Returns:
+ [`FlaxSdeVeOutput`] or `tuple`: [`FlaxSdeVeOutput`] if `return_dict` is True, otherwise a `tuple`. When
+ returning a tuple, the first element is the sample tensor.
+
+ """
+ if state.timesteps is None:
+ raise ValueError(
+ "`state.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler"
+ )
+
+ # For small batch sizes, the paper "suggest replacing norm(z) with sqrt(d), where d is the dim. of z"
+ # sample noise for correction
+ key = random.split(key, num=1)
+ noise = random.normal(key=key, shape=sample.shape)
+
+ # compute step size from the model_output, the noise, and the snr
+ grad_norm = jnp.linalg.norm(model_output)
+ noise_norm = jnp.linalg.norm(noise)
+ step_size = (self.config.snr * noise_norm / grad_norm) ** 2 * 2
+ step_size = step_size * jnp.ones(sample.shape[0])
+
+ # compute corrected sample: model_output term and noise term
+ step_size = step_size.flatten()
+ step_size = broadcast_to_shape_from_left(step_size, sample.shape)
+ prev_sample_mean = sample + step_size * model_output
+ prev_sample = prev_sample_mean + ((step_size * 2) ** 0.5) * noise
+
+ if not return_dict:
+ return (prev_sample, state)
+
+ return FlaxSdeVeOutput(prev_sample=prev_sample, state=state)
+
+ def __len__(self):
+ return self.config.num_train_timesteps
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_sde_vp.py b/src/model/TextGen/diffusers/schedulers/scheduling_sde_vp.py
new file mode 100644
index 0000000..a37a159
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_sde_vp.py
@@ -0,0 +1,87 @@
+# Copyright 2022 Google Brain and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/yang-song/score_sde_pytorch
+
+import math
+from typing import Union
+
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils import SchedulerMixin
+
+
+class ScoreSdeVpScheduler(SchedulerMixin, ConfigMixin):
+ """
+ The variance preserving stochastic differential equation (SDE) scheduler.
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ For more information, see the original paper: https://arxiv.org/abs/2011.13456
+
+ UNDER CONSTRUCTION
+
+ """
+
+ @register_to_config
+ def __init__(self, num_train_timesteps=2000, beta_min=0.1, beta_max=20, sampling_eps=1e-3):
+ self.sigmas = None
+ self.discrete_sigmas = None
+ self.timesteps = None
+
+ def set_timesteps(self, num_inference_steps, device: Union[str, torch.device] = None):
+ self.timesteps = torch.linspace(1, self.config.sampling_eps, num_inference_steps, device=device)
+
+ def step_pred(self, score, x, t, generator=None):
+ if self.timesteps is None:
+ raise ValueError(
+ "`self.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler"
+ )
+
+ # TODO(Patrick) better comments + non-PyTorch
+ # postprocess model score
+ log_mean_coeff = (
+ -0.25 * t**2 * (self.config.beta_max - self.config.beta_min) - 0.5 * t * self.config.beta_min
+ )
+ std = torch.sqrt(1.0 - torch.exp(2.0 * log_mean_coeff))
+ std = std.flatten()
+ while len(std.shape) < len(score.shape):
+ std = std.unsqueeze(-1)
+ score = -score / std
+
+        # compute the discretized reverse-SDE update
+ dt = -1.0 / len(self.timesteps)
+
+ beta_t = self.config.beta_min + t * (self.config.beta_max - self.config.beta_min)
+ beta_t = beta_t.flatten()
+ while len(beta_t.shape) < len(x.shape):
+ beta_t = beta_t.unsqueeze(-1)
+ drift = -0.5 * beta_t * x
+
+ diffusion = torch.sqrt(beta_t)
+ drift = drift - diffusion**2 * score
+ x_mean = x + drift * dt
+
+ # add noise
+ noise = torch.randn(x.shape, layout=x.layout, generator=generator).to(x.device)
+ x = x_mean + diffusion * math.sqrt(-dt) * noise
+
+ return x, x_mean
+
+ def __len__(self):
+ return self.config.num_train_timesteps
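+
+
+# --- Illustrative usage sketch (not part of the upstream file) ---
+# The scheduler is marked UNDER CONSTRUCTION above; this only shows the
+# intended call pattern. `step_pred` expects the timestep `t` as a tensor,
+# and the random score below stands in for a trained score model.
+if __name__ == "__main__":
+    scheduler = ScoreSdeVpScheduler()
+    scheduler.set_timesteps(num_inference_steps=10)
+    x = torch.randn(1, 3, 8, 8)
+    for t in scheduler.timesteps:
+        score = torch.randn_like(x)  # stand-in score estimate
+        x, x_mean = scheduler.step_pred(score, x, t)
+    print(x_mean.shape)  # torch.Size([1, 3, 8, 8])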
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_utils.py b/src/model/TextGen/diffusers/schedulers/scheduling_utils.py
new file mode 100644
index 0000000..acfb4dd
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_utils.py
@@ -0,0 +1,43 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+import torch
+
+from ..utils import BaseOutput
+
+
+SCHEDULER_CONFIG_NAME = "scheduler_config.json"
+
+
+@dataclass
+class SchedulerOutput(BaseOutput):
+ """
+ Base class for the scheduler's step function output.
+
+ Args:
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+ Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+ denoising loop.
+ """
+
+ prev_sample: torch.FloatTensor
+ pred_original_sample: Optional[torch.FloatTensor] = None
+
+
+class SchedulerMixin:
+ """
+ Mixin containing common functions for the schedulers.
+ """
+
+ config_name = SCHEDULER_CONFIG_NAME
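+
+
+# A minimal sketch (illustrative) of the pattern concrete schedulers follow: subclass both mixins
+# and let `@register_to_config` capture the `__init__` arguments (compare `VQDiffusionScheduler`
+# later in this package, which does exactly this).
+#
+#     class MyScheduler(SchedulerMixin, ConfigMixin):
+#         @register_to_config
+#         def __init__(self, num_train_timesteps: int = 1000):
+#             ...  # later available as `self.config.num_train_timesteps`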
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_utils_flax.py b/src/model/TextGen/diffusers/schedulers/scheduling_utils_flax.py
new file mode 100644
index 0000000..e545cfe
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_utils_flax.py
@@ -0,0 +1,49 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Tuple
+
+import jax.numpy as jnp
+
+from ..utils import BaseOutput
+
+
+SCHEDULER_CONFIG_NAME = "scheduler_config.json"
+
+
+@dataclass
+class FlaxSchedulerOutput(BaseOutput):
+ """
+ Base class for the scheduler's step function output.
+
+ Args:
+ prev_sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)` for images):
+ Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+ denoising loop.
+ """
+
+ prev_sample: jnp.ndarray
+
+
+class FlaxSchedulerMixin:
+ """
+ Mixin containing common functions for the schedulers.
+ """
+
+ config_name = SCHEDULER_CONFIG_NAME
+
+
+def broadcast_to_shape_from_left(x: jnp.ndarray, shape: Tuple[int]) -> jnp.ndarray:
+ assert len(shape) >= x.ndim
+ return jnp.broadcast_to(x.reshape(x.shape + (1,) * (len(shape) - x.ndim)), shape)
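+
+
+# An illustrative example (not part of the library API): pad a per-sample coefficient with trailing
+# singleton axes, then broadcast it against an image-shaped batch.
+#
+#     coeff = jnp.array([0.1, 0.2])                              # shape (2,)
+#     out = broadcast_to_shape_from_left(coeff, (2, 3, 8, 8))    # shape (2, 3, 8, 8)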
diff --git a/src/model/TextGen/diffusers/schedulers/scheduling_vq_diffusion.py b/src/model/TextGen/diffusers/schedulers/scheduling_vq_diffusion.py
new file mode 100644
index 0000000..dbe320d
--- /dev/null
+++ b/src/model/TextGen/diffusers/schedulers/scheduling_vq_diffusion.py
@@ -0,0 +1,494 @@
+# Copyright 2022 Microsoft and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from .scheduling_utils import SchedulerMixin
+
+
+@dataclass
+class VQDiffusionSchedulerOutput(BaseOutput):
+ """
+ Output class for the scheduler's step function output.
+
+ Args:
+ prev_sample (`torch.LongTensor` of shape `(batch size, num latent pixels)`):
+ Computed sample x_{t-1} of previous timestep. `prev_sample` should be used as next model input in the
+ denoising loop.
+ """
+
+ prev_sample: torch.LongTensor
+
+
+def index_to_log_onehot(x: torch.LongTensor, num_classes: int) -> torch.FloatTensor:
+ """
+ Convert batch of vector of class indices into batch of log onehot vectors
+
+ Args:
+ x (`torch.LongTensor` of shape `(batch size, vector length)`):
+ Batch of class indices
+
+ num_classes (`int`):
+ number of classes to be used for the onehot vectors
+
+ Returns:
+ `torch.FloatTensor` of shape `(batch size, num classes, vector length)`:
+ Log onehot vectors
+ """
+ x_onehot = F.one_hot(x, num_classes)
+ x_onehot = x_onehot.permute(0, 2, 1)
+ log_x = torch.log(x_onehot.float().clamp(min=1e-30))
+ return log_x
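+
+# Shape sanity check (illustrative, not part of the library): 2 latent pixels, 4 classes.
+#
+#     x = torch.tensor([[0, 3]])                     # (batch, num latent pixels)
+#     log_x = index_to_log_onehot(x, num_classes=4)  # (batch, num classes, num latent pixels) == (1, 4, 2)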
+
+
+def gumbel_noised(logits: torch.FloatTensor, generator: Optional[torch.Generator]) -> torch.FloatTensor:
+ """
+    Apply Gumbel noise to `logits`.
+ """
+ uniform = torch.rand(logits.shape, device=logits.device, generator=generator)
+ gumbel_noise = -torch.log(-torch.log(uniform + 1e-30) + 1e-30)
+ noised = gumbel_noise + logits
+ return noised
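+
+# Note (illustrative): taking an `argmax` over `gumbel_noised(logits, generator)` draws a sample
+# from the categorical distribution softmax(logits) independently per latent pixel (the Gumbel-max
+# trick); this is how `step` samples x_{t-1} below.
+#
+#     logits = torch.randn(1, 4, 2)
+#     sample = gumbel_noised(logits, generator=None).argmax(dim=1)  # shape (1, 2)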
+
+
+def alpha_schedules(num_diffusion_timesteps: int, alpha_cum_start=0.99999, alpha_cum_end=0.000009):
+ """
+ Cumulative and non-cumulative alpha schedules.
+
+    See Section 4.1 of the VQ-Diffusion paper: https://arxiv.org/abs/2111.14822.
+ """
+ att = (
+ np.arange(0, num_diffusion_timesteps) / (num_diffusion_timesteps - 1) * (alpha_cum_end - alpha_cum_start)
+ + alpha_cum_start
+ )
+ att = np.concatenate(([1], att))
+ at = att[1:] / att[:-1]
+ att = np.concatenate((att[1:], [1]))
+ return at, att
+
+
+def gamma_schedules(num_diffusion_timesteps: int, gamma_cum_start=0.000009, gamma_cum_end=0.99999):
+ """
+ Cumulative and non-cumulative gamma schedules.
+
+    See Section 4.1 of the VQ-Diffusion paper: https://arxiv.org/abs/2111.14822.
+ """
+ ctt = (
+ np.arange(0, num_diffusion_timesteps) / (num_diffusion_timesteps - 1) * (gamma_cum_end - gamma_cum_start)
+ + gamma_cum_start
+ )
+ ctt = np.concatenate(([0], ctt))
+ one_minus_ctt = 1 - ctt
+ one_minus_ct = one_minus_ctt[1:] / one_minus_ctt[:-1]
+ ct = 1 - one_minus_ct
+ ctt = np.concatenate((ctt[1:], [0]))
+ return ct, ctt
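+
+# Consistency check (illustrative): the non-cumulative schedule multiplies up to the cumulative
+# one, since `at = att_prev[1:] / att_prev[:-1]` telescopes under a running product.
+#
+#     at, att = alpha_schedules(100)
+#     assert np.allclose(np.cumprod(at), att[:-1])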
+
+
+class VQDiffusionScheduler(SchedulerMixin, ConfigMixin):
+ """
+ The VQ-diffusion transformer outputs predicted probabilities of the initial unnoised image.
+
+ The VQ-diffusion scheduler converts the transformer's output into a sample for the unnoised image at the previous
+ diffusion timestep.
+
+ [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+ function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+ [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+ [`~ConfigMixin.from_config`] functions.
+
+ For more details, see the original paper: https://arxiv.org/abs/2111.14822
+
+ Args:
+ num_vec_classes (`int`):
+ The number of classes of the vector embeddings of the latent pixels. Includes the class for the masked
+ latent pixel.
+
+ num_train_timesteps (`int`):
+ Number of diffusion steps used to train the model.
+
+ alpha_cum_start (`float`):
+ The starting cumulative alpha value.
+
+ alpha_cum_end (`float`):
+ The ending cumulative alpha value.
+
+ gamma_cum_start (`float`):
+ The starting cumulative gamma value.
+
+ gamma_cum_end (`float`):
+ The ending cumulative gamma value.
+ """
+
+ @register_to_config
+ def __init__(
+ self,
+ num_vec_classes: int,
+ num_train_timesteps: int = 100,
+ alpha_cum_start: float = 0.99999,
+ alpha_cum_end: float = 0.000009,
+ gamma_cum_start: float = 0.000009,
+ gamma_cum_end: float = 0.99999,
+ ):
+ self.num_embed = num_vec_classes
+
+ # By convention, the index for the mask class is the last class index
+ self.mask_class = self.num_embed - 1
+
+ at, att = alpha_schedules(num_train_timesteps, alpha_cum_start=alpha_cum_start, alpha_cum_end=alpha_cum_end)
+ ct, ctt = gamma_schedules(num_train_timesteps, gamma_cum_start=gamma_cum_start, gamma_cum_end=gamma_cum_end)
+
+ num_non_mask_classes = self.num_embed - 1
+ bt = (1 - at - ct) / num_non_mask_classes
+ btt = (1 - att - ctt) / num_non_mask_classes
+
+ at = torch.tensor(at.astype("float64"))
+ bt = torch.tensor(bt.astype("float64"))
+ ct = torch.tensor(ct.astype("float64"))
+ log_at = torch.log(at)
+ log_bt = torch.log(bt)
+ log_ct = torch.log(ct)
+
+ att = torch.tensor(att.astype("float64"))
+ btt = torch.tensor(btt.astype("float64"))
+ ctt = torch.tensor(ctt.astype("float64"))
+ log_cumprod_at = torch.log(att)
+ log_cumprod_bt = torch.log(btt)
+ log_cumprod_ct = torch.log(ctt)
+
+ self.log_at = log_at.float()
+ self.log_bt = log_bt.float()
+ self.log_ct = log_ct.float()
+ self.log_cumprod_at = log_cumprod_at.float()
+ self.log_cumprod_bt = log_cumprod_bt.float()
+ self.log_cumprod_ct = log_cumprod_ct.float()
+
+ # setable values
+ self.num_inference_steps = None
+ self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy())
+
+ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+ """
+ Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+ Args:
+ num_inference_steps (`int`):
+ the number of diffusion steps used when generating samples with a pre-trained model.
+
+ device (`str` or `torch.device`):
+ device to place the timesteps and the diffusion process parameters (alpha, beta, gamma) on.
+ """
+ self.num_inference_steps = num_inference_steps
+ timesteps = np.arange(0, self.num_inference_steps)[::-1].copy()
+ self.timesteps = torch.from_numpy(timesteps).to(device)
+
+ self.log_at = self.log_at.to(device)
+ self.log_bt = self.log_bt.to(device)
+ self.log_ct = self.log_ct.to(device)
+ self.log_cumprod_at = self.log_cumprod_at.to(device)
+ self.log_cumprod_bt = self.log_cumprod_bt.to(device)
+ self.log_cumprod_ct = self.log_cumprod_ct.to(device)
+
+ def step(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: torch.long,
+ sample: torch.LongTensor,
+ generator: Optional[torch.Generator] = None,
+ return_dict: bool = True,
+ ) -> Union[VQDiffusionSchedulerOutput, Tuple]:
+ """
+        Predict the sample at the previous timestep via the reverse transition distribution, i.e. Equation (11). See
+        the docstring for `self.q_posterior` for more in-depth docs on how Equation (11) is computed.
+
+        Args:
+            model_output (`torch.FloatTensor` of shape `(batch size, num classes - 1, num latent pixels)`):
+                The log probabilities for the predicted classes of the initial latent pixels. Does not include a
+                prediction for the masked class as the initial unnoised image cannot be masked.
+
+            timestep (`torch.long`):
+                The timestep that determines which transition matrices are used.
+
+            sample (`torch.LongTensor` of shape `(batch size, num latent pixels)`):
+                The classes of each latent pixel at time `timestep`.
+
+            generator (`torch.Generator` or `None`):
+                RNG for the Gumbel noise applied to p(x_{t-1} | x_t) before it is sampled from.
+
+            return_dict (`bool`):
+                Option for returning a tuple rather than a `VQDiffusionSchedulerOutput`.
+
+        Returns:
+            [`~schedulers.scheduling_vq_diffusion.VQDiffusionSchedulerOutput`] or `tuple`:
+                [`~schedulers.scheduling_vq_diffusion.VQDiffusionSchedulerOutput`] if `return_dict` is True, otherwise
+                a `tuple`. When returning a tuple, the first element is the sample tensor.
+ """
+ if timestep == 0:
+ log_p_x_t_min_1 = model_output
+ else:
+ log_p_x_t_min_1 = self.q_posterior(model_output, sample, timestep)
+
+ log_p_x_t_min_1 = gumbel_noised(log_p_x_t_min_1, generator)
+
+ x_t_min_1 = log_p_x_t_min_1.argmax(dim=1)
+
+ if not return_dict:
+ return (x_t_min_1,)
+
+ return VQDiffusionSchedulerOutput(prev_sample=x_t_min_1)
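+
+    # A minimal sampling-loop sketch (hypothetical: `transformer` is an assumed model returning
+    # log p(x_0) over the non-masked classes, and the sizes are illustrative):
+    #
+    #     scheduler = VQDiffusionScheduler(num_vec_classes=4097)
+    #     scheduler.set_timesteps(100)
+    #     sample = torch.full((1, 1024), scheduler.mask_class, dtype=torch.long)
+    #     for t in scheduler.timesteps:
+    #         log_p_x_0 = transformer(sample, t)
+    #         sample = scheduler.step(log_p_x_0, t, sample).prev_sample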
+
+ def q_posterior(self, log_p_x_0, x_t, t):
+ """
+ Calculates the log probabilities for the predicted classes of the image at timestep `t-1`. I.e. Equation (11).
+
+        Instead of directly computing Equation (11), we use Equation (5) to restate Equation (11) in terms of only
+        forward probabilities.
+
+        Equation (11) stated in terms of forward probabilities via Equation (5):
+
+            p(x_{t-1} | x_t) = sum( q(x_t | x_{t-1}) * q(x_{t-1} | x_0) * p(x_0) / q(x_t | x_0) )
+
+        where the sum is over x_0 = {C_0 ... C_{k-1}} (the classes for x_0).
+
+ Args:
+ log_p_x_0: (`torch.FloatTensor` of shape `(batch size, num classes - 1, num latent pixels)`):
+ The log probabilities for the predicted classes of the initial latent pixels. Does not include a
+ prediction for the masked class as the initial unnoised image cannot be masked.
+
+ x_t: (`torch.LongTensor` of shape `(batch size, num latent pixels)`):
+ The classes of each latent pixel at time `t`
+
+            t (`torch.long`):
+ The timestep that determines which transition matrix is used.
+
+ Returns:
+ `torch.FloatTensor` of shape `(batch size, num classes, num latent pixels)`:
+ The log probabilities for the predicted classes of the image at timestep `t-1`. I.e. Equation (11).
+ """
+ log_onehot_x_t = index_to_log_onehot(x_t, self.num_embed)
+
+ log_q_x_t_given_x_0 = self.log_Q_t_transitioning_to_known_class(
+ t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=True
+ )
+
+ log_q_t_given_x_t_min_1 = self.log_Q_t_transitioning_to_known_class(
+ t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=False
+ )
+
+ # p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) ... p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0)
+ # . . .
+ # . . .
+ # . . .
+ # p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) ... p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1})
+ q = log_p_x_0 - log_q_x_t_given_x_0
+
+ # sum_0 = p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) + ... + p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}), ... ,
+ # sum_n = p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) + ... + p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1})
+ q_log_sum_exp = torch.logsumexp(q, dim=1, keepdim=True)
+
+ # p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_0 ... p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_n
+ # . . .
+ # . . .
+ # . . .
+ # p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_0 ... p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_n
+ q = q - q_log_sum_exp
+
+ # (p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1} ... (p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1}
+ # . . .
+ # . . .
+ # . . .
+ # (p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1} ... (p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1}
+ # c_cumulative_{t-1} ... c_cumulative_{t-1}
+ q = self.apply_cumulative_transitions(q, t - 1)
+
+ # ((p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_0) * sum_0 ... ((p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_0) * sum_n
+ # . . .
+ # . . .
+ # . . .
+ # ((p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_{k-1}) * sum_0 ... ((p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_{k-1}) * sum_n
+        #                                      c_cumulative_{t-1} * q(x_t | x_{t-1}=C_k) * sum_0                      ...                      c_cumulative_{t-1} * q(x_t | x_{t-1}=C_k) * sum_n
+ log_p_x_t_min_1 = q + log_q_t_given_x_t_min_1 + q_log_sum_exp
+
+ # For each column, there are two possible cases.
+ #
+ # Where:
+        #   - sum(p_n(x_0)) is summing over all classes for x_0
+ # - C_i is the class transitioning from (not to be confused with c_t and c_cumulative_t being used for gamma's)
+ # - C_j is the class transitioning to
+ #
+ # 1. x_t is masked i.e. x_t = c_k
+ #
+ # Simplifying the expression, the column vector is:
+ # .
+ # .
+ # .
+ # (c_t / c_cumulative_t) * (a_cumulative_{t-1} * p_n(x_0 = C_i | x_t) + b_cumulative_{t-1} * sum(p_n(x_0)))
+ # .
+ # .
+ # .
+ # (c_cumulative_{t-1} / c_cumulative_t) * sum(p_n(x_0))
+ #
+ # From equation (11) stated in terms of forward probabilities, the last row is trivially verified.
+ #
+ # For the other rows, we can state the equation as ...
+ #
+        #   (c_t / c_cumulative_t) * [b_cumulative_{t-1} * p(x_0=c_0) + ... + (a_cumulative_{t-1} + b_cumulative_{t-1}) * p(x_0=C_i) + ... + b_cumulative_{t-1} * p(x_0=c_{k-1})]
+ #
+ # This verifies the other rows.
+ #
+ # 2. x_t is not masked
+ #
+ # Simplifying the expression, there are two cases for the rows of the column vector, where C_j = C_i and where C_j != C_i:
+ # .
+ # .
+ # .
+        #   C_j != C_i: b_t * ((b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_0) + ... + ((a_cumulative_{t-1} + b_cumulative_{t-1}) / b_cumulative_t) * p_n(x_0 = C_i) + ... + (b_cumulative_{t-1} / (a_cumulative_t + b_cumulative_t)) * p_n(x_0 = C_j) + ... + (b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_{k-1}))
+ # .
+ # .
+ # .
+ # C_j = C_i: (a_t + b_t) * ((b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_0) + ... + ((a_cumulative_{t-1} + b_cumulative_{t-1}) / (a_cumulative_t + b_cumulative_t)) * p_n(x_0 = C_i = C_j) + ... + (b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_{k-1}))
+ # .
+ # .
+ # .
+ # 0
+ #
+ # The last row is trivially verified. The other rows can be verified by directly expanding equation (11) stated in terms of forward probabilities.
+ return log_p_x_t_min_1
+
+ def log_Q_t_transitioning_to_known_class(
+ self, *, t: torch.int, x_t: torch.LongTensor, log_onehot_x_t: torch.FloatTensor, cumulative: bool
+ ):
+ """
+ Returns the log probabilities of the rows from the (cumulative or non-cumulative) transition matrix for each
+ latent pixel in `x_t`.
+
+ See equation (7) for the complete non-cumulative transition matrix. The complete cumulative transition matrix
+ is the same structure except the parameters (alpha, beta, gamma) are the cumulative analogs.
+
+ Args:
+            t (`torch.long`):
+ The timestep that determines which transition matrix is used.
+
+ x_t (`torch.LongTensor` of shape `(batch size, num latent pixels)`):
+ The classes of each latent pixel at time `t`.
+
+ log_onehot_x_t (`torch.FloatTensor` of shape `(batch size, num classes, num latent pixels)`):
+ The log one-hot vectors of `x_t`
+
+ cumulative (`bool`):
+ If cumulative is `False`, we use the single step transition matrix `t-1`->`t`. If cumulative is `True`,
+ we use the cumulative transition matrix `0`->`t`.
+
+        Returns:
+            `torch.FloatTensor` of shape `(batch size, num classes, num latent pixels)`:
+                Each _column_ of the returned matrix is a _row_ of log probabilities of the complete probability
+                transition matrix.
+
+                When cumulative, the matrix has only `self.num_embed - 1` rows, because the initial latent pixel
+                (x_0) cannot be masked.
+
+ Where:
+ - `q_n` is the probability distribution for the forward process of the `n`th latent pixel.
+ - C_0 is a class of a latent pixel embedding
+ - C_k is the class of the masked latent pixel
+
+ non-cumulative result (omitting logarithms):
+ ```
+ q_0(x_t | x_{t-1} = C_0) ... q_n(x_t | x_{t-1} = C_0)
+ . . .
+ . . .
+ . . .
+ q_0(x_t | x_{t-1} = C_k) ... q_n(x_t | x_{t-1} = C_k)
+ ```
+
+ cumulative result (omitting logarithms):
+ ```
+ q_0_cumulative(x_t | x_0 = C_0) ... q_n_cumulative(x_t | x_0 = C_0)
+ . . .
+ . . .
+ . . .
+ q_0_cumulative(x_t | x_0 = C_{k-1}) ... q_n_cumulative(x_t | x_0 = C_{k-1})
+ ```
+ """
+ if cumulative:
+ a = self.log_cumprod_at[t]
+ b = self.log_cumprod_bt[t]
+ c = self.log_cumprod_ct[t]
+ else:
+ a = self.log_at[t]
+ b = self.log_bt[t]
+ c = self.log_ct[t]
+
+ if not cumulative:
+ # The values in the onehot vector can also be used as the logprobs for transitioning
+ # from masked latent pixels. If we are not calculating the cumulative transitions,
+ # we need to save these vectors to be re-appended to the final matrix so the values
+ # aren't overwritten.
+ #
+            # `P(x_t != mask | x_{t-1} = mask) = 0` and 0 will be the value of the last row of the onehot vector
+            # if x_t is not masked
+            #
+            # `P(x_t = mask | x_{t-1} = mask) = 1` and 1 will be the value of the last row of the onehot vector
+            # if x_t is masked
+ log_onehot_x_t_transitioning_from_masked = log_onehot_x_t[:, -1, :].unsqueeze(1)
+
+ # `index_to_log_onehot` will add onehot vectors for masked pixels,
+ # so the default one hot matrix has one too many rows. See the doc string
+ # for an explanation of the dimensionality of the returned matrix.
+ log_onehot_x_t = log_onehot_x_t[:, :-1, :]
+
+        # This is a cheeky trick to produce the transition probabilities using log one-hot vectors.
+        #
+        # Don't worry about what values this sets in the columns that mark transitions
+        # to masked latent pixels. They are overwritten later with the `mask_class_mask`.
+ #
+ # Looking at the below logspace formula in non-logspace, each value will evaluate to either
+ # `1 * a + b = a + b` where `log_Q_t` has the one hot value in the column
+ # or
+ # `0 * a + b = b` where `log_Q_t` has the 0 values in the column.
+ #
+ # See equation 7 for more details.
+ log_Q_t = (log_onehot_x_t + a).logaddexp(b)
+
+ # The whole column of each masked pixel is `c`
+ mask_class_mask = x_t == self.mask_class
+ mask_class_mask = mask_class_mask.unsqueeze(1).expand(-1, self.num_embed - 1, -1)
+ log_Q_t[mask_class_mask] = c
+
+ if not cumulative:
+ log_Q_t = torch.cat((log_Q_t, log_onehot_x_t_transitioning_from_masked), dim=1)
+
+ return log_Q_t
+
+ def apply_cumulative_transitions(self, q, t):
+ bsz = q.shape[0]
+ a = self.log_cumprod_at[t]
+ b = self.log_cumprod_bt[t]
+ c = self.log_cumprod_ct[t]
+
+ num_latent_pixels = q.shape[2]
+ c = c.expand(bsz, 1, num_latent_pixels)
+
+ q = (q + a).logaddexp(b)
+ q = torch.cat((q, c), dim=1)
+
+ return q
diff --git a/src/model/TextGen/diffusers/training_utils.py b/src/model/TextGen/diffusers/training_utils.py
new file mode 100644
index 0000000..fa16941
--- /dev/null
+++ b/src/model/TextGen/diffusers/training_utils.py
@@ -0,0 +1,125 @@
+import copy
+import os
+import random
+
+import numpy as np
+import torch
+
+
+def enable_full_determinism(seed: int):
+ """
+    Helper function for reproducible behavior during distributed training. See
+    https://pytorch.org/docs/stable/notes/randomness.html for details on PyTorch determinism.
+ """
+ # set seed first
+ set_seed(seed)
+
+ # Enable PyTorch deterministic mode. This potentially requires either the environment
+ # variable 'CUDA_LAUNCH_BLOCKING' or 'CUBLAS_WORKSPACE_CONFIG' to be set,
+ # depending on the CUDA version, so we set them both here
+ os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+ os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
+ torch.use_deterministic_algorithms(True)
+
+ # Enable CUDNN deterministic mode
+ torch.backends.cudnn.deterministic = True
+ torch.backends.cudnn.benchmark = False
+
+
+def set_seed(seed: int):
+    """
+    Helper function for reproducible behavior: sets the seed in `random`, `numpy`, and `torch`.
+
+    Args:
+        seed (`int`): The seed to set.
+    """
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed_all(seed)
+ # ^^ safe to call this function even if cuda is not available
+
+
+class EMAModel:
+ """
+    Exponential Moving Average of model weights.
+ """
+
+ def __init__(
+ self,
+ model,
+ update_after_step=0,
+ inv_gamma=1.0,
+ power=2 / 3,
+ min_value=0.0,
+ max_value=0.9999,
+ device=None,
+ ):
+ """
+ @crowsonkb's notes on EMA Warmup:
+ If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
+ to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
+ gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
+ at 215.4k steps).
+ Args:
+ inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
+ power (float): Exponential factor of EMA warmup. Default: 2/3.
+ min_value (float): The minimum EMA decay rate. Default: 0.
+ """
+
+ self.averaged_model = copy.deepcopy(model).eval()
+ self.averaged_model.requires_grad_(False)
+
+ self.update_after_step = update_after_step
+ self.inv_gamma = inv_gamma
+ self.power = power
+ self.min_value = min_value
+ self.max_value = max_value
+
+ if device is not None:
+ self.averaged_model = self.averaged_model.to(device=device)
+
+ self.decay = 0.0
+ self.optimization_step = 0
+
+ def get_decay(self, optimization_step):
+ """
+ Compute the decay factor for the exponential moving average.
+ """
+ step = max(0, optimization_step - self.update_after_step - 1)
+ value = 1 - (1 + step / self.inv_gamma) ** -self.power
+
+ if step <= 0:
+ return 0.0
+
+ return max(self.min_value, min(value, self.max_value))
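+
+    # Illustrative check of @crowsonkb's note above (inv_gamma=1, power=2/3): at ~31.6K steps,
+    # 1 - (1 + 31_600) ** (-2 / 3) ~= 0.999, so `get_decay` reaches a 0.999 decay factor there.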
+
+ @torch.no_grad()
+ def step(self, new_model):
+ ema_state_dict = {}
+ ema_params = self.averaged_model.state_dict()
+
+ self.decay = self.get_decay(self.optimization_step)
+
+ for key, param in new_model.named_parameters():
+ if isinstance(param, dict):
+ continue
+ try:
+ ema_param = ema_params[key]
+ except KeyError:
+ ema_param = param.float().clone() if param.ndim == 1 else copy.deepcopy(param)
+ ema_params[key] = ema_param
+
+ if not param.requires_grad:
+ ema_params[key].copy_(param.to(dtype=ema_param.dtype).data)
+ ema_param = ema_params[key]
+ else:
+ ema_param.mul_(self.decay)
+ ema_param.add_(param.data.to(dtype=ema_param.dtype), alpha=1 - self.decay)
+
+ ema_state_dict[key] = ema_param
+
+ for key, param in new_model.named_buffers():
+ ema_state_dict[key] = param
+
+ self.averaged_model.load_state_dict(ema_state_dict, strict=False)
+ self.optimization_step += 1
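+
+
+# A minimal training-loop sketch (hypothetical: `model`, `dataloader`, `optimizer`, and `loss_fn`
+# are assumed to exist):
+#
+#     ema = EMAModel(model, power=3 / 4)
+#     for batch in dataloader:
+#         loss = loss_fn(model(batch))
+#         loss.backward()
+#         optimizer.step()
+#         optimizer.zero_grad()
+#         ema.step(model)  # update the averaged copy after each optimizer step
+#     # evaluate / checkpoint with `ema.averaged_model`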
diff --git a/src/model/TextGen/diffusers/utils/__init__.py b/src/model/TextGen/diffusers/utils/__init__.py
new file mode 100644
index 0000000..a00e1f4
--- /dev/null
+++ b/src/model/TextGen/diffusers/utils/__init__.py
@@ -0,0 +1,74 @@
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+
+from .deprecation_utils import deprecate
+from .import_utils import (
+ ENV_VARS_TRUE_AND_AUTO_VALUES,
+ ENV_VARS_TRUE_VALUES,
+ USE_JAX,
+ USE_TF,
+ USE_TORCH,
+ DummyObject,
+ is_accelerate_available,
+ is_flax_available,
+ is_inflect_available,
+ is_modelcards_available,
+ is_onnx_available,
+ is_scipy_available,
+ is_tf_available,
+ is_torch_available,
+ is_torch_version,
+ is_transformers_available,
+ is_unidecode_available,
+ requires_backends,
+)
+from .logging import get_logger
+from .outputs import BaseOutput
+
+
+if is_torch_available():
+ from .testing_utils import (
+ floats_tensor,
+ load_hf_numpy,
+ load_image,
+ load_numpy,
+ parse_flag_from_env,
+ require_torch_gpu,
+ slow,
+ torch_all_close,
+ torch_device,
+ )
+
+
+logger = get_logger(__name__)
+
+
+hf_cache_home = os.path.expanduser(
+ os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
+)
+default_cache_path = os.path.join(hf_cache_home, "diffusers")
+
+
+CONFIG_NAME = "config.json"
+WEIGHTS_NAME = "diffusion_pytorch_model.bin"
+FLAX_WEIGHTS_NAME = "diffusion_flax_model.msgpack"
+ONNX_WEIGHTS_NAME = "model.onnx"
+ONNX_EXTERNAL_WEIGHTS_NAME = "weights.pb"
+HUGGINGFACE_CO_RESOLVE_ENDPOINT = "https://huggingface.co"
+DIFFUSERS_CACHE = default_cache_path
+DIFFUSERS_DYNAMIC_MODULE_NAME = "diffusers_modules"
+HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(hf_cache_home, "modules"))
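+
+
+# Illustrative resolution of the constants above: with neither HF_HOME nor XDG_CACHE_HOME set,
+# DIFFUSERS_CACHE resolves to `~/.cache/huggingface/diffusers`.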
diff --git a/src/model/TextGen/diffusers/utils/deprecation_utils.py b/src/model/TextGen/diffusers/utils/deprecation_utils.py
new file mode 100644
index 0000000..eac4303
--- /dev/null
+++ b/src/model/TextGen/diffusers/utils/deprecation_utils.py
@@ -0,0 +1,49 @@
+import inspect
+import warnings
+from typing import Any, Dict, Optional, Union
+
+from packaging import version
+
+
+def deprecate(*args, take_from: Optional[Union[Dict, Any]] = None, standard_warn=True):
+ from .. import __version__
+
+ deprecated_kwargs = take_from
+ values = ()
+ if not isinstance(args[0], tuple):
+ args = (args,)
+
+ for attribute, version_name, message in args:
+ if version.parse(version.parse(__version__).base_version) >= version.parse(version_name):
+ raise ValueError(
+ f"The deprecation tuple {(attribute, version_name, message)} should be removed since diffusers'"
+ f" version {__version__} is >= {version_name}"
+ )
+
+ warning = None
+ if isinstance(deprecated_kwargs, dict) and attribute in deprecated_kwargs:
+ values += (deprecated_kwargs.pop(attribute),)
+ warning = f"The `{attribute}` argument is deprecated and will be removed in version {version_name}."
+ elif hasattr(deprecated_kwargs, attribute):
+ values += (getattr(deprecated_kwargs, attribute),)
+ warning = f"The `{attribute}` attribute is deprecated and will be removed in version {version_name}."
+ elif deprecated_kwargs is None:
+ warning = f"`{attribute}` is deprecated and will be removed in version {version_name}."
+
+ if warning is not None:
+ warning = warning + " " if standard_warn else ""
+ warnings.warn(warning + message, DeprecationWarning)
+
+ if isinstance(deprecated_kwargs, dict) and len(deprecated_kwargs) > 0:
+ call_frame = inspect.getouterframes(inspect.currentframe())[1]
+ filename = call_frame.filename
+ line_number = call_frame.lineno
+ function = call_frame.function
+ key, value = next(iter(deprecated_kwargs.items()))
+ raise TypeError(f"{function} in {filename} line {line_number-1} got an unexpected keyword argument `{key}`")
+
+ if len(values) == 0:
+ return
+ elif len(values) == 1:
+ return values[0]
+ return values
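+
+
+# A minimal usage sketch (hypothetical argument names): pop a deprecated kwarg with a warning and
+# fall back to its replacement.
+#
+#     kwargs = {"old_width": 512}
+#     old_width = deprecate("old_width", "1.0.0", "Use `width` instead.", take_from=kwargs)
+#     width = old_width if old_width is not None else 512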
diff --git a/src/model/TextGen/diffusers/utils/dummy_flax_and_transformers_objects.py b/src/model/TextGen/diffusers/utils/dummy_flax_and_transformers_objects.py
new file mode 100644
index 0000000..14830bc
--- /dev/null
+++ b/src/model/TextGen/diffusers/utils/dummy_flax_and_transformers_objects.py
@@ -0,0 +1,19 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+# flake8: noqa
+
+from ..utils import DummyObject, requires_backends
+
+
+class FlaxStableDiffusionPipeline(metaclass=DummyObject):
+ _backends = ["flax", "transformers"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["flax", "transformers"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["flax", "transformers"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["flax", "transformers"])
diff --git a/src/model/TextGen/diffusers/utils/dummy_flax_objects.py b/src/model/TextGen/diffusers/utils/dummy_flax_objects.py
new file mode 100644
index 0000000..8e308bb
--- /dev/null
+++ b/src/model/TextGen/diffusers/utils/dummy_flax_objects.py
@@ -0,0 +1,184 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+# flake8: noqa
+
+from ..utils import DummyObject, requires_backends
+
+
+class FlaxModelMixin(metaclass=DummyObject):
+ _backends = ["flax"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["flax"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+
+class FlaxUNet2DConditionModel(metaclass=DummyObject):
+ _backends = ["flax"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["flax"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+
+class FlaxAutoencoderKL(metaclass=DummyObject):
+ _backends = ["flax"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["flax"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+
+class FlaxDiffusionPipeline(metaclass=DummyObject):
+ _backends = ["flax"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["flax"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+
+class FlaxDDIMScheduler(metaclass=DummyObject):
+ _backends = ["flax"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["flax"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+
+class FlaxDDPMScheduler(metaclass=DummyObject):
+ _backends = ["flax"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["flax"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+
+class FlaxDPMSolverMultistepScheduler(metaclass=DummyObject):
+ _backends = ["flax"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["flax"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+
+class FlaxKarrasVeScheduler(metaclass=DummyObject):
+ _backends = ["flax"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["flax"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+
+class FlaxLMSDiscreteScheduler(metaclass=DummyObject):
+ _backends = ["flax"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["flax"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+
+class FlaxPNDMScheduler(metaclass=DummyObject):
+ _backends = ["flax"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["flax"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+
+class FlaxSchedulerMixin(metaclass=DummyObject):
+ _backends = ["flax"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["flax"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+
+class FlaxScoreSdeVeScheduler(metaclass=DummyObject):
+ _backends = ["flax"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["flax"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["flax"])
diff --git a/src/model/TextGen/diffusers/utils/dummy_pt_objects.py b/src/model/TextGen/diffusers/utils/dummy_pt_objects.py
new file mode 100644
index 0000000..af2e0c7
--- /dev/null
+++ b/src/model/TextGen/diffusers/utils/dummy_pt_objects.py
@@ -0,0 +1,482 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+# flake8: noqa
+
+from ..utils import DummyObject, requires_backends
+
+
+class ModelMixin(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class AutoencoderKL(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class Transformer2DModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class UNet1DModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class UNet2DConditionModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class UNet2DModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class VQModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+def get_constant_schedule(*args, **kwargs):
+ requires_backends(get_constant_schedule, ["torch"])
+
+
+def get_constant_schedule_with_warmup(*args, **kwargs):
+ requires_backends(get_constant_schedule_with_warmup, ["torch"])
+
+
+def get_cosine_schedule_with_warmup(*args, **kwargs):
+ requires_backends(get_cosine_schedule_with_warmup, ["torch"])
+
+
+def get_cosine_with_hard_restarts_schedule_with_warmup(*args, **kwargs):
+ requires_backends(get_cosine_with_hard_restarts_schedule_with_warmup, ["torch"])
+
+
+def get_linear_schedule_with_warmup(*args, **kwargs):
+ requires_backends(get_linear_schedule_with_warmup, ["torch"])
+
+
+def get_polynomial_decay_schedule_with_warmup(*args, **kwargs):
+ requires_backends(get_polynomial_decay_schedule_with_warmup, ["torch"])
+
+
+def get_scheduler(*args, **kwargs):
+ requires_backends(get_scheduler, ["torch"])
+
+
+class DiffusionPipeline(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class DanceDiffusionPipeline(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class DDIMPipeline(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class DDPMPipeline(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class KarrasVePipeline(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class LDMPipeline(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class LDMSuperResolutionPipeline(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class PNDMPipeline(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class RePaintPipeline(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class ScoreSdeVePipeline(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class DDIMScheduler(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class DDPMScheduler(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class DPMSolverMultistepScheduler(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class EulerAncestralDiscreteScheduler(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class EulerDiscreteScheduler(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class IPNDMScheduler(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class KarrasVeScheduler(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class PNDMScheduler(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class RePaintScheduler(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class SchedulerMixin(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class ScoreSdeVeScheduler(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class VQDiffusionScheduler(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
+class EMAModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
diff --git a/src/model/TextGen/diffusers/utils/dummy_torch_and_scipy_objects.py b/src/model/TextGen/diffusers/utils/dummy_torch_and_scipy_objects.py
new file mode 100644
index 0000000..13f1734
--- /dev/null
+++ b/src/model/TextGen/diffusers/utils/dummy_torch_and_scipy_objects.py
@@ -0,0 +1,19 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+# flake8: noqa
+
+from ..utils import DummyObject, requires_backends
+
+
+class LMSDiscreteScheduler(metaclass=DummyObject):
+ _backends = ["torch", "scipy"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "scipy"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "scipy"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "scipy"])
diff --git a/src/model/TextGen/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py b/src/model/TextGen/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py
new file mode 100644
index 0000000..2210200
--- /dev/null
+++ b/src/model/TextGen/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py
@@ -0,0 +1,64 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+# flake8: noqa
+
+from ..utils import DummyObject, requires_backends
+
+
+class OnnxStableDiffusionImg2ImgPipeline(metaclass=DummyObject):
+ _backends = ["torch", "transformers", "onnx"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers", "onnx"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers", "onnx"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers", "onnx"])
+
+
+class OnnxStableDiffusionInpaintPipeline(metaclass=DummyObject):
+ _backends = ["torch", "transformers", "onnx"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers", "onnx"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers", "onnx"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers", "onnx"])
+
+
+class OnnxStableDiffusionPipeline(metaclass=DummyObject):
+ _backends = ["torch", "transformers", "onnx"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers", "onnx"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers", "onnx"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers", "onnx"])
+
+
+class StableDiffusionOnnxPipeline(metaclass=DummyObject):
+ _backends = ["torch", "transformers", "onnx"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers", "onnx"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers", "onnx"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers", "onnx"])
diff --git a/src/model/TextGen/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/model/TextGen/diffusers/utils/dummy_torch_and_transformers_objects.py
new file mode 100644
index 0000000..63e8a60
--- /dev/null
+++ b/src/model/TextGen/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -0,0 +1,109 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+# flake8: noqa
+
+from ..utils import DummyObject, requires_backends
+
+
+class CycleDiffusionPipeline(metaclass=DummyObject):
+ _backends = ["torch", "transformers"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+
+class LDMTextToImagePipeline(metaclass=DummyObject):
+ _backends = ["torch", "transformers"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionImg2ImgPipeline(metaclass=DummyObject):
+ _backends = ["torch", "transformers"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionInpaintPipeline(metaclass=DummyObject):
+ _backends = ["torch", "transformers"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionInpaintPipelineLegacy(metaclass=DummyObject):
+ _backends = ["torch", "transformers"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+
+class StableDiffusionPipeline(metaclass=DummyObject):
+ _backends = ["torch", "transformers"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+
+class VQDiffusionPipeline(metaclass=DummyObject):
+ _backends = ["torch", "transformers"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch", "transformers"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch", "transformers"])
diff --git a/src/model/TextGen/diffusers/utils/import_utils.py b/src/model/TextGen/diffusers/utils/import_utils.py
new file mode 100644
index 0000000..005cbb6
--- /dev/null
+++ b/src/model/TextGen/diffusers/utils/import_utils.py
@@ -0,0 +1,349 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utilities related to imports and lazy initialization.
+"""
+import importlib.util
+import operator as op
+import os
+import sys
+from collections import OrderedDict
+from typing import Union
+
+from packaging import version
+from packaging.version import Version, parse
+
+from . import logging
+
+
+# The package importlib_metadata is in a different place, depending on the python version.
+if sys.version_info < (3, 8):
+ import importlib_metadata
+else:
+ import importlib.metadata as importlib_metadata
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
+ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
+
+USE_TF = os.environ.get("USE_TF", "AUTO").upper()
+USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
+USE_JAX = os.environ.get("USE_FLAX", "AUTO").upper()
+
+STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt}
+
+_torch_version = "N/A"
+if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
+ _torch_available = importlib.util.find_spec("torch") is not None
+ if _torch_available:
+ try:
+ _torch_version = importlib_metadata.version("torch")
+ logger.info(f"PyTorch version {_torch_version} available.")
+ except importlib_metadata.PackageNotFoundError:
+ _torch_available = False
+else:
+ logger.info("Disabling PyTorch because USE_TF is set")
+ _torch_available = False
+
+
+_tf_version = "N/A"
+if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES:
+ _tf_available = importlib.util.find_spec("tensorflow") is not None
+ if _tf_available:
+ candidates = (
+ "tensorflow",
+ "tensorflow-cpu",
+ "tensorflow-gpu",
+ "tf-nightly",
+ "tf-nightly-cpu",
+ "tf-nightly-gpu",
+ "intel-tensorflow",
+ "intel-tensorflow-avx512",
+ "tensorflow-rocm",
+ "tensorflow-macos",
+ "tensorflow-aarch64",
+ )
+ _tf_version = None
+ # For the metadata, we have to look for both tensorflow and tensorflow-cpu
+ for pkg in candidates:
+ try:
+ _tf_version = importlib_metadata.version(pkg)
+ break
+ except importlib_metadata.PackageNotFoundError:
+ pass
+ _tf_available = _tf_version is not None
+ if _tf_available:
+ if version.parse(_tf_version) < version.parse("2"):
+ logger.info(f"TensorFlow found but with version {_tf_version}. Diffusers requires version 2 minimum.")
+ _tf_available = False
+ else:
+ logger.info(f"TensorFlow version {_tf_version} available.")
+else:
+    logger.info("Disabling TensorFlow because USE_TORCH is set")
+ _tf_available = False
+
+_jax_version = "N/A"
+_flax_version = "N/A"
+if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:
+ _flax_available = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("flax") is not None
+ if _flax_available:
+ try:
+ _jax_version = importlib_metadata.version("jax")
+ _flax_version = importlib_metadata.version("flax")
+ logger.info(f"JAX version {_jax_version}, Flax version {_flax_version} available.")
+ except importlib_metadata.PackageNotFoundError:
+ _flax_available = False
+else:
+ _flax_available = False
+
+
+_transformers_available = importlib.util.find_spec("transformers") is not None
+try:
+ _transformers_version = importlib_metadata.version("transformers")
+ logger.debug(f"Successfully imported transformers version {_transformers_version}")
+except importlib_metadata.PackageNotFoundError:
+ _transformers_available = False
+
+
+_inflect_available = importlib.util.find_spec("inflect") is not None
+try:
+ _inflect_version = importlib_metadata.version("inflect")
+ logger.debug(f"Successfully imported inflect version {_inflect_version}")
+except importlib_metadata.PackageNotFoundError:
+ _inflect_available = False
+
+
+_unidecode_available = importlib.util.find_spec("unidecode") is not None
+try:
+ _unidecode_version = importlib_metadata.version("unidecode")
+ logger.debug(f"Successfully imported unidecode version {_unidecode_version}")
+except importlib_metadata.PackageNotFoundError:
+ _unidecode_available = False
+
+
+_modelcards_available = importlib.util.find_spec("modelcards") is not None
+try:
+ _modelcards_version = importlib_metadata.version("modelcards")
+ logger.debug(f"Successfully imported modelcards version {_modelcards_version}")
+except importlib_metadata.PackageNotFoundError:
+ _modelcards_available = False
+
+
+_onnxruntime_version = "N/A"
+_onnx_available = importlib.util.find_spec("onnxruntime") is not None
+if _onnx_available:
+ candidates = ("onnxruntime", "onnxruntime-gpu", "onnxruntime-directml", "onnxruntime-openvino")
+ _onnxruntime_version = None
+ # For the metadata, we have to look for both onnxruntime and onnxruntime-gpu
+ for pkg in candidates:
+ try:
+ _onnxruntime_version = importlib_metadata.version(pkg)
+ break
+ except importlib_metadata.PackageNotFoundError:
+ pass
+ _onnx_available = _onnxruntime_version is not None
+ if _onnx_available:
+ logger.debug(f"Successfully imported onnxruntime version {_onnxruntime_version}")
+
+
+_scipy_available = importlib.util.find_spec("scipy") is not None
+try:
+ _scipy_version = importlib_metadata.version("scipy")
+ logger.debug(f"Successfully imported transformers version {_scipy_version}")
+except importlib_metadata.PackageNotFoundError:
+ _scipy_available = False
+
+_accelerate_available = importlib.util.find_spec("accelerate") is not None
+try:
+ _accelerate_version = importlib_metadata.version("accelerate")
+ logger.debug(f"Successfully imported accelerate version {_accelerate_version}")
+except importlib_metadata.PackageNotFoundError:
+ _accelerate_available = False
+
+_xformers_available = importlib.util.find_spec("xformers") is not None
+try:
+ _xformers_version = importlib_metadata.version("xformers")
+ if _torch_available:
+ import torch
+
+ if version.parse(torch.__version__) < version.parse("1.12"):
+ raise ValueError("PyTorch should be >= 1.12")
+ logger.debug(f"Successfully imported xformers version {_xformers_version}")
+except importlib_metadata.PackageNotFoundError:
+ _xformers_available = False
+
+
+def is_torch_available():
+ return _torch_available
+
+
+def is_tf_available():
+ return _tf_available
+
+
+def is_flax_available():
+ return _flax_available
+
+
+def is_transformers_available():
+ return _transformers_available
+
+
+def is_inflect_available():
+ return _inflect_available
+
+
+def is_unidecode_available():
+ return _unidecode_available
+
+
+def is_modelcards_available():
+ return _modelcards_available
+
+
+def is_onnx_available():
+ return _onnx_available
+
+
+def is_scipy_available():
+ return _scipy_available
+
+
+def is_xformers_available():
+ return _xformers_available
+
+
+def is_accelerate_available():
+ return _accelerate_available
+
+
+# docstyle-ignore
+FLAX_IMPORT_ERROR = """
+{0} requires the FLAX library but it was not found in your environment. Check out the instructions on the
+installation page: https://github.com/google/flax and follow the ones that match your environment.
+"""
+
+# docstyle-ignore
+INFLECT_IMPORT_ERROR = """
+{0} requires the inflect library but it was not found in your environment. You can install it with pip: `pip install
+inflect`
+"""
+
+# docstyle-ignore
+PYTORCH_IMPORT_ERROR = """
+{0} requires the PyTorch library but it was not found in your environment. Check out the instructions on the
+installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
+"""
+
+# docstyle-ignore
+ONNX_IMPORT_ERROR = """
+{0} requires the onnxruntime library but it was not found in your environment. You can install it with pip: `pip
+install onnxruntime`
+"""
+
+# docstyle-ignore
+SCIPY_IMPORT_ERROR = """
+{0} requires the scipy library but it was not found in your environment. You can install it with pip: `pip install
+scipy`
+"""
+
+# docstyle-ignore
+TENSORFLOW_IMPORT_ERROR = """
+{0} requires the TensorFlow library but it was not found in your environment. Check out the instructions on the
+installation page: https://www.tensorflow.org/install and follow the ones that match your environment.
+"""
+
+# docstyle-ignore
+TRANSFORMERS_IMPORT_ERROR = """
+{0} requires the transformers library but it was not found in your environment. You can install it with pip: `pip
+install transformers`
+"""
+
+# docstyle-ignore
+UNIDECODE_IMPORT_ERROR = """
+{0} requires the unidecode library but it was not found in your environment. You can install it with pip: `pip install
+Unidecode`
+"""
+
+
+BACKENDS_MAPPING = OrderedDict(
+ [
+ ("flax", (is_flax_available, FLAX_IMPORT_ERROR)),
+ ("inflect", (is_inflect_available, INFLECT_IMPORT_ERROR)),
+ ("onnx", (is_onnx_available, ONNX_IMPORT_ERROR)),
+ ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)),
+ ("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)),
+ ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)),
+ ("transformers", (is_transformers_available, TRANSFORMERS_IMPORT_ERROR)),
+ ("unidecode", (is_unidecode_available, UNIDECODE_IMPORT_ERROR)),
+ ]
+)
+
+
+def requires_backends(obj, backends):
+ if not isinstance(backends, (list, tuple)):
+ backends = [backends]
+
+ name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
+ checks = (BACKENDS_MAPPING[backend] for backend in backends)
+ failed = [msg.format(name) for available, msg in checks if not available()]
+ if failed:
+ raise ImportError("".join(failed))
+
+
+class DummyObject(type):
+ """
+ Metaclass for the dummy objects. Any class inheriting from it will return the ImportError generated by
+ `requires_backends` each time a user tries to access any method of that class.
+ """
+
+ def __getattr__(cls, key):
+ if key.startswith("_"):
+ return super().__getattr__(cls, key)
+ requires_backends(cls, cls._backends)
+
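+# Usage sketch (illustrative; `ExamplePipeline` is a hypothetical name): a class
+# gated on optional backends declares them via `_backends`; if any backend is
+# missing, every public attribute access raises the combined ImportError from
+# `requires_backends`:
+#
+#   class ExamplePipeline(metaclass=DummyObject):
+#       _backends = ["torch", "transformers"]
+#
+#   ExamplePipeline.from_pretrained  # -> ImportError listing the missing backends
+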
+
+# This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319
+def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str):
+ """
+ Compares a library version to some requirement using a given operation.
+
+ Args:
+ library_or_version (`str` or `packaging.version.Version`):
+ A library name or a version to check.
+ operation (`str`):
+ A string representation of an operator, such as `">"` or `"<="`.
+ requirement_version (`str`):
+ The version to compare the library version against.
+ """
+ if operation not in STR_OPERATION_TO_FUNC.keys():
+ raise ValueError(f"`operation` must be one of {list(STR_OPERATION_TO_FUNC.keys())}, received {operation}")
+ operation = STR_OPERATION_TO_FUNC[operation]
+ if isinstance(library_or_version, str):
+ library_or_version = parse(importlib_metadata.version(library_or_version))
+ return operation(library_or_version, parse(requirement_version))
+
+
+# This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L338
+def is_torch_version(operation: str, version: str):
+ """
+ Compares the current PyTorch version to a given reference with an operation.
+
+ Args:
+ operation (`str`):
+ A string representation of an operator, such as `">"` or `"<="`.
+ version (`str`):
+ A string version of PyTorch.
+ """
+ return compare_versions(parse(_torch_version), operation, version)
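+
+
+# Usage sketch: gate version-dependent code paths, e.g.
+#   if is_torch_version(">=", "1.12"):
+#       ...  # features that require torch 1.12+
+#   compare_versions("transformers", ">=", "4.20.0")  # works for any installed library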
diff --git a/src/model/TextGen/diffusers/utils/logging.py b/src/model/TextGen/diffusers/utils/logging.py
new file mode 100644
index 0000000..8c1c77d
--- /dev/null
+++ b/src/model/TextGen/diffusers/utils/logging.py
@@ -0,0 +1,340 @@
+# coding=utf-8
+# Copyright 2020 Optuna, Hugging Face
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Logging utilities."""
+
+import logging
+import os
+import sys
+import threading
+from logging import CRITICAL # NOQA
+from logging import DEBUG # NOQA
+from logging import ERROR # NOQA
+from logging import FATAL # NOQA
+from logging import INFO # NOQA
+from logging import NOTSET # NOQA
+from logging import WARN # NOQA
+from logging import WARNING # NOQA
+from typing import Optional
+
+from tqdm import auto as tqdm_lib
+
+
+_lock = threading.Lock()
+_default_handler: Optional[logging.Handler] = None
+
+log_levels = {
+ "debug": logging.DEBUG,
+ "info": logging.INFO,
+ "warning": logging.WARNING,
+ "error": logging.ERROR,
+ "critical": logging.CRITICAL,
+}
+
+_default_log_level = logging.WARNING
+
+_tqdm_active = True
+
+
+def _get_default_logging_level():
+ """
+ If the DIFFUSERS_VERBOSITY env var is set to one of the valid choices, return that as the new default level; if it
+ is not, fall back to `_default_log_level`.
+ """
+ env_level_str = os.getenv("DIFFUSERS_VERBOSITY", None)
+ if env_level_str:
+ if env_level_str in log_levels:
+ return log_levels[env_level_str]
+ else:
+ logging.getLogger().warning(
+ f"Unknown option DIFFUSERS_VERBOSITY={env_level_str}, "
+ f"has to be one of: { ', '.join(log_levels.keys()) }"
+ )
+ return _default_log_level
+
+
+def _get_library_name() -> str:
+ return __name__.split(".")[0]
+
+
+def _get_library_root_logger() -> logging.Logger:
+ return logging.getLogger(_get_library_name())
+
+
+def _configure_library_root_logger() -> None:
+ global _default_handler
+
+ with _lock:
+ if _default_handler:
+ # This library has already configured the library root logger.
+ return
+ _default_handler = logging.StreamHandler() # Set sys.stderr as stream.
+ _default_handler.flush = sys.stderr.flush
+
+ # Apply our default configuration to the library root logger.
+ library_root_logger = _get_library_root_logger()
+ library_root_logger.addHandler(_default_handler)
+ library_root_logger.setLevel(_get_default_logging_level())
+ library_root_logger.propagate = False
+
+
+def _reset_library_root_logger() -> None:
+ global _default_handler
+
+ with _lock:
+ if not _default_handler:
+ return
+
+ library_root_logger = _get_library_root_logger()
+ library_root_logger.removeHandler(_default_handler)
+ library_root_logger.setLevel(logging.NOTSET)
+ _default_handler = None
+
+
+def get_log_levels_dict():
+ return log_levels
+
+
+def get_logger(name: Optional[str] = None) -> logging.Logger:
+ """
+ Return a logger with the specified name.
+
+ This function is not supposed to be directly accessed unless you are writing a custom diffusers module.
+ """
+
+ if name is None:
+ name = _get_library_name()
+
+ _configure_library_root_logger()
+ return logging.getLogger(name)
+
+
+def get_verbosity() -> int:
+ """
+ Return the current level for the 🤗 Diffusers' root logger as an int.
+
+ Returns:
+ `int`: The logging level.
+
+
+
+ 🤗 Diffusers has the following logging levels:
+
+ - 50: `diffusers.logging.CRITICAL` or `diffusers.logging.FATAL`
+ - 40: `diffusers.logging.ERROR`
+ - 30: `diffusers.logging.WARNING` or `diffusers.logging.WARN`
+ - 20: `diffusers.logging.INFO`
+ - 10: `diffusers.logging.DEBUG`
+
+ """
+
+ _configure_library_root_logger()
+ return _get_library_root_logger().getEffectiveLevel()
+
+
+def set_verbosity(verbosity: int) -> None:
+ """
+ Set the verbosity level for the 🤗 Diffusers' root logger.
+
+ Args:
+ verbosity (`int`):
+ Logging level, e.g., one of:
+
+ - `diffusers.logging.CRITICAL` or `diffusers.logging.FATAL`
+ - `diffusers.logging.ERROR`
+ - `diffusers.logging.WARNING` or `diffusers.logging.WARN`
+ - `diffusers.logging.INFO`
+ - `diffusers.logging.DEBUG`
+ """
+
+ _configure_library_root_logger()
+ _get_library_root_logger().setLevel(verbosity)
+
+
+def set_verbosity_info():
+ """Set the verbosity to the `INFO` level."""
+ return set_verbosity(INFO)
+
+
+def set_verbosity_warning():
+ """Set the verbosity to the `WARNING` level."""
+ return set_verbosity(WARNING)
+
+
+def set_verbosity_debug():
+ """Set the verbosity to the `DEBUG` level."""
+ return set_verbosity(DEBUG)
+
+
+def set_verbosity_error():
+ """Set the verbosity to the `ERROR` level."""
+ return set_verbosity(ERROR)
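+
+
+# Usage sketch: verbosity is controlled either programmatically or via the
+# DIFFUSERS_VERBOSITY env var, e.g.
+#   logger = get_logger(__name__)
+#   set_verbosity_info()            # equivalent to: export DIFFUSERS_VERBOSITY=info
+#   logger.info("now visible at INFO level")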
+
+
+def disable_default_handler() -> None:
+ """Disable the default handler of the HuggingFace Diffusers' root logger."""
+
+ _configure_library_root_logger()
+
+ assert _default_handler is not None
+ _get_library_root_logger().removeHandler(_default_handler)
+
+
+def enable_default_handler() -> None:
+ """Enable the default handler of the HuggingFace Diffusers' root logger."""
+
+ _configure_library_root_logger()
+
+ assert _default_handler is not None
+ _get_library_root_logger().addHandler(_default_handler)
+
+
+def add_handler(handler: logging.Handler) -> None:
+ """adds a handler to the HuggingFace Diffusers' root logger."""
+
+ _configure_library_root_logger()
+
+ assert handler is not None
+ _get_library_root_logger().addHandler(handler)
+
+
+def remove_handler(handler: logging.Handler) -> None:
+ """removes given handler from the HuggingFace Diffusers' root logger."""
+
+ _configure_library_root_logger()
+
+ assert handler is not None and handler in _get_library_root_logger().handlers
+ _get_library_root_logger().removeHandler(handler)
+
+
+def disable_propagation() -> None:
+ """
+ Disable propagation of the library log outputs. Note that log propagation is disabled by default.
+ """
+
+ _configure_library_root_logger()
+ _get_library_root_logger().propagate = False
+
+
+def enable_propagation() -> None:
+ """
+ Enable propagation of the library log outputs. Please disable the HuggingFace Diffusers' default handler to prevent
+ double logging if the root logger has been configured.
+ """
+
+ _configure_library_root_logger()
+ _get_library_root_logger().propagate = True
+
+
+def enable_explicit_format() -> None:
+ """
+ Enable explicit formatting for every HuggingFace Diffusers' logger. The explicit formatter is as follows:
+ ```
+ [LEVELNAME|FILENAME:LINE NUMBER] TIME >> MESSAGE
+ ```
+ All handlers currently bound to the root logger are affected by this method.
+ """
+ handlers = _get_library_root_logger().handlers
+
+ for handler in handlers:
+ formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s")
+ handler.setFormatter(formatter)
+
+
+def reset_format() -> None:
+ """
+ Resets the formatting for HuggingFace Diffusers' loggers.
+
+ All handlers currently bound to the root logger are affected by this method.
+ """
+ handlers = _get_library_root_logger().handlers
+
+ for handler in handlers:
+ handler.setFormatter(None)
+
+
+def warning_advice(self, *args, **kwargs):
+ """
+ This method is identical to `logger.warning()`, but if env var DIFFUSERS_NO_ADVISORY_WARNINGS=1 is set, this
+ warning will not be printed
+ """
+ no_advisory_warnings = os.getenv("DIFFUSERS_NO_ADVISORY_WARNINGS", False)
+ if no_advisory_warnings:
+ return
+ self.warning(*args, **kwargs)
+
+
+logging.Logger.warning_advice = warning_advice
+
+
+class EmptyTqdm:
+ """Dummy tqdm which doesn't do anything."""
+
+ def __init__(self, *args, **kwargs): # pylint: disable=unused-argument
+ self._iterator = args[0] if args else None
+
+ def __iter__(self):
+ return iter(self._iterator)
+
+ def __getattr__(self, _):
+ """Return empty function."""
+
+ def empty_fn(*args, **kwargs): # pylint: disable=unused-argument
+ return
+
+ return empty_fn
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, type_, value, traceback):
+ return
+
+
+class _tqdm_cls:
+ def __call__(self, *args, **kwargs):
+ if _tqdm_active:
+ return tqdm_lib.tqdm(*args, **kwargs)
+ else:
+ return EmptyTqdm(*args, **kwargs)
+
+ def set_lock(self, *args, **kwargs):
+ self._lock = None
+ if _tqdm_active:
+ return tqdm_lib.tqdm.set_lock(*args, **kwargs)
+
+ def get_lock(self):
+ if _tqdm_active:
+ return tqdm_lib.tqdm.get_lock()
+
+
+tqdm = _tqdm_cls()
+
+
+def is_progress_bar_enabled() -> bool:
+ """Return a boolean indicating whether tqdm progress bars are enabled."""
+ global _tqdm_active
+ return bool(_tqdm_active)
+
+
+def enable_progress_bar():
+ """Enable tqdm progress bar."""
+ global _tqdm_active
+ _tqdm_active = True
+
+
+def disable_progress_bar():
+ """Disable tqdm progress bar."""
+ global _tqdm_active
+ _tqdm_active = False
diff --git a/src/model/TextGen/diffusers/utils/model_card_template.md b/src/model/TextGen/diffusers/utils/model_card_template.md
new file mode 100644
index 0000000..f19c85b
--- /dev/null
+++ b/src/model/TextGen/diffusers/utils/model_card_template.md
@@ -0,0 +1,50 @@
+---
+{{ card_data }}
+---
+
+
+
+# {{ model_name | default("Diffusion Model") }}
+
+## Model description
+
+This diffusion model is trained with the [🤗 Diffusers](https://github.com/huggingface/diffusers) library
+on the `{{ dataset_name }}` dataset.
+
+## Intended uses & limitations
+
+#### How to use
+
+```python
+# TODO: add an example code snippet for running this diffusion pipeline
+```
+
+#### Limitations and bias
+
+[TODO: provide examples of latent issues and potential remediations]
+
+## Training data
+
+[TODO: describe the data used to train the model]
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: {{ learning_rate }}
+- train_batch_size: {{ train_batch_size }}
+- eval_batch_size: {{ eval_batch_size }}
+- gradient_accumulation_steps: {{ gradient_accumulation_steps }}
+- optimizer: AdamW with betas=({{ adam_beta1 }}, {{ adam_beta2 }}), weight_decay={{ adam_weight_decay }} and epsilon={{ adam_epsilon }}
+- lr_scheduler: {{ lr_scheduler }}
+- lr_warmup_steps: {{ lr_warmup_steps }}
+- ema_inv_gamma: {{ ema_inv_gamma }}
+- ema_power: {{ ema_power }}
+- ema_max_decay: {{ ema_max_decay }}
+- mixed_precision: {{ mixed_precision }}
+
+### Training results
+
+📈 [TensorBoard logs](https://huggingface.co/{{ repo_name }}/tensorboard?#scalars)
+
+
diff --git a/src/model/TextGen/diffusers/utils/outputs.py b/src/model/TextGen/diffusers/utils/outputs.py
new file mode 100644
index 0000000..5d902dd
--- /dev/null
+++ b/src/model/TextGen/diffusers/utils/outputs.py
@@ -0,0 +1,108 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Generic utilities
+"""
+
+from collections import OrderedDict
+from dataclasses import fields
+from typing import Any, Tuple
+
+import numpy as np
+
+from .import_utils import is_torch_available
+
+
+def is_tensor(x):
+ """
+ Tests if `x` is a `torch.Tensor` or `np.ndarray`.
+ """
+ if is_torch_available():
+ import torch
+
+ if isinstance(x, torch.Tensor):
+ return True
+
+ return isinstance(x, np.ndarray)
+
+
+class BaseOutput(OrderedDict):
+ """
+ Base class for all model outputs as dataclasses. Has a `__getitem__` that allows indexing by integer or slice (like
+ a tuple) or by string (like a dictionary), ignoring `None` attributes. Otherwise behaves like a regular
+ Python dictionary.
+
+
+
+ You can't unpack a `BaseOutput` directly. Use the [`~utils.BaseOutput.to_tuple`] method to convert it to a tuple
+ before.
+
+
+ """
+
+ def __post_init__(self):
+ class_fields = fields(self)
+
+ # Safety and consistency checks
+ if not len(class_fields):
+ raise ValueError(f"{self.__class__.__name__} has no fields.")
+
+ first_field = getattr(self, class_fields[0].name)
+ other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:])
+
+ if other_fields_are_none and isinstance(first_field, dict):
+ for key, value in first_field.items():
+ self[key] = value
+ else:
+ for field in class_fields:
+ v = getattr(self, field.name)
+ if v is not None:
+ self[field.name] = v
+
+ def __delitem__(self, *args, **kwargs):
+ raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
+
+ def setdefault(self, *args, **kwargs):
+ raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
+
+ def pop(self, *args, **kwargs):
+ raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
+
+ def update(self, *args, **kwargs):
+ raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
+
+ def __getitem__(self, k):
+ if isinstance(k, str):
+ inner_dict = {k: v for (k, v) in self.items()}
+ return inner_dict[k]
+ else:
+ return self.to_tuple()[k]
+
+ def __setattr__(self, name, value):
+ if name in self.keys() and value is not None:
+ # Don't call self.__setitem__ to avoid recursion errors
+ super().__setitem__(name, value)
+ super().__setattr__(name, value)
+
+ def __setitem__(self, key, value):
+ # Will raise a KeyError if needed
+ super().__setitem__(key, value)
+ # Don't call self.__setattr__ to avoid recursion errors
+ super().__setattr__(key, value)
+
+ def to_tuple(self) -> Tuple[Any]:
+ """
+ Convert self to a tuple containing all the attributes/keys that are not `None`.
+ """
+ return tuple(self[k] for k in self.keys())
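+
+
+# Illustrative subclass (hypothetical field name), showing the dict-, attribute-
+# and tuple-style access that `BaseOutput` provides:
+#
+#   from dataclasses import dataclass
+#
+#   @dataclass
+#   class ExampleOutput(BaseOutput):
+#       images: np.ndarray = None
+#
+#   out = ExampleOutput(images=np.zeros((1, 64, 64, 3)))
+#   out["images"] is out.images            # dict- and attribute-style agree
+#   out.to_tuple()[0] is out["images"]     # tuple conversion keeps non-None fields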
diff --git a/src/model/TextGen/diffusers/utils/testing_utils.py b/src/model/TextGen/diffusers/utils/testing_utils.py
new file mode 100644
index 0000000..bf398e5
--- /dev/null
+++ b/src/model/TextGen/diffusers/utils/testing_utils.py
@@ -0,0 +1,393 @@
+import inspect
+import logging
+import os
+import random
+import re
+import unittest
+import urllib.parse
+from distutils.util import strtobool
+from io import BytesIO, StringIO
+from pathlib import Path
+from typing import Union
+
+import numpy as np
+
+import PIL.Image
+import PIL.ImageOps
+import requests
+from packaging import version
+
+from .import_utils import is_flax_available, is_onnx_available, is_torch_available
+
+
+global_rng = random.Random()
+
+
+if is_torch_available():
+ import torch
+
+ torch_device = "cuda" if torch.cuda.is_available() else "cpu"
+ is_torch_higher_equal_than_1_12 = version.parse(version.parse(torch.__version__).base_version) >= version.parse(
+ "1.12"
+ )
+
+ if is_torch_higher_equal_than_1_12:
+ # Some builds of torch 1.12 don't have the mps backend registered. See #892 for more details
+ mps_backend_registered = hasattr(torch.backends, "mps")
+ torch_device = "mps" if (mps_backend_registered and torch.backends.mps.is_available()) else torch_device
+
+
+def torch_all_close(a, b, *args, **kwargs):
+ if not is_torch_available():
+ raise ValueError("PyTorch needs to be installed to use this function.")
+ if not torch.allclose(a, b, *args, **kwargs):
+ assert False, f"Max diff is absolute {(a - b).abs().max()}. Diff tensor is {(a - b).abs()}."
+ return True
+
+
+def get_tests_dir(append_path=None):
+ """
+ Args:
+ append_path: optional path to append to the tests dir path
+ Return:
+ The full path to the `tests` dir, so that the tests can be invoked from anywhere. Optionally `append_path` is
+ joined after the `tests` dir if the former is provided.
+ """
+ # this function caller's __file__
+ caller__file__ = inspect.stack()[1][1]
+ tests_dir = os.path.abspath(os.path.dirname(caller__file__))
+
+ while not tests_dir.endswith("tests"):
+ tests_dir = os.path.dirname(tests_dir)
+
+ if append_path:
+ return os.path.join(tests_dir, append_path)
+ else:
+ return tests_dir
+
+
+def parse_flag_from_env(key, default=False):
+ try:
+ value = os.environ[key]
+ except KeyError:
+ # KEY isn't set, default to `default`.
+ _value = default
+ else:
+ # KEY is set, convert it to True or False.
+ try:
+ _value = strtobool(value)
+ except ValueError:
+ # More values are supported, but let's keep the message simple.
+ raise ValueError(f"If set, {key} must be yes or no.")
+ return _value
+
+
+_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
+
+
+def floats_tensor(shape, scale=1.0, rng=None, name=None):
+ """Creates a random float32 tensor"""
+ if rng is None:
+ rng = global_rng
+
+ total_dims = 1
+ for dim in shape:
+ total_dims *= dim
+
+ values = []
+ for _ in range(total_dims):
+ values.append(rng.random() * scale)
+
+ return torch.tensor(data=values, dtype=torch.float).view(shape).contiguous()
+
+
+def slow(test_case):
+ """
+ Decorator marking a test as slow.
+
+ Slow tests are skipped by default. Set the RUN_SLOW environment variable to a truthy value to run them.
+
+ """
+ return unittest.skipUnless(_run_slow_tests, "test is slow")(test_case)
+
+
+def require_torch(test_case):
+ """
+ Decorator marking a test that requires PyTorch. These tests are skipped when PyTorch isn't installed.
+ """
+ return unittest.skipUnless(is_torch_available(), "test requires PyTorch")(test_case)
+
+
+def require_torch_gpu(test_case):
+ """Decorator marking a test that requires CUDA and PyTorch."""
+ return unittest.skipUnless(is_torch_available() and torch_device == "cuda", "test requires PyTorch+CUDA")(
+ test_case
+ )
+
+
+def require_flax(test_case):
+ """
+ Decorator marking a test that requires JAX & Flax. These tests are skipped when one / both are not installed
+ """
+ return unittest.skipUnless(is_flax_available(), "test requires JAX & Flax")(test_case)
+
+
+def require_onnxruntime(test_case):
+ """
+ Decorator marking a test that requires onnxruntime. These tests are skipped when onnxruntime isn't installed.
+ """
+ return unittest.skipUnless(is_onnx_available(), "test requires onnxruntime")(test_case)
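+
+
+# Usage sketch: the markers compose on a test case and are driven by the environment, e.g.
+#
+#   @slow
+#   @require_torch_gpu
+#   class ExamplePipelineTests(unittest.TestCase):  # hypothetical test class
+#       def test_inference(self):
+#           ...
+#
+# Run with RUN_SLOW=1 (and a CUDA device) for the test to execute.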
+
+
+def load_numpy(arry: Union[str, np.ndarray]) -> np.ndarray:
+ if isinstance(arry, str):
+ if arry.startswith("http://") or arry.startswith("https://"):
+ response = requests.get(arry)
+ response.raise_for_status()
+ arry = np.load(BytesIO(response.content))
+ elif os.path.isfile(arry):
+ arry = np.load(arry)
+ else:
+ raise ValueError(
+ f"Incorrect path or url, URLs must start with `http://` or `https://`, and {arry} is not a valid path"
+ )
+ elif isinstance(arry, np.ndarray):
+ pass
+ else:
+ raise ValueError(
+ "Incorrect format used for numpy ndarray. Should be an url linking to an image, a local path, or a"
+ " ndarray."
+ )
+
+ return arry
+
+
+def load_image(image: Union[str, PIL.Image.Image]) -> PIL.Image.Image:
+ """
+ Loads `image` into a PIL Image.
+
+ Args:
+ image (`str` or `PIL.Image.Image`):
+ The image to convert to the PIL Image format.
+ Returns:
+ `PIL.Image.Image`: A PIL Image.
+ """
+ if isinstance(image, str):
+ if image.startswith("http://") or image.startswith("https://"):
+ image = PIL.Image.open(requests.get(image, stream=True).raw)
+ elif os.path.isfile(image):
+ image = PIL.Image.open(image)
+ else:
+ raise ValueError(
+ f"Incorrect path or url, URLs must start with `http://` or `https://`, and {image} is not a valid path"
+ )
+ elif isinstance(image, PIL.Image.Image):
+ image = image
+ else:
+ raise ValueError(
+ "Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image."
+ )
+ image = PIL.ImageOps.exif_transpose(image)
+ image = image.convert("RGB")
+ return image
+
+
+def load_hf_numpy(path) -> np.ndarray:
+ if not (path.startswith("http://") or path.startswith("https://")):
+ path = os.path.join(
+ "https://huggingface.co/datasets/fusing/diffusers-testing/resolve/main", urllib.parse.quote(path)
+ )
+
+ return load_numpy(path)
+
+
+# --- pytest conf functions --- #
+
+# to avoid multiple invocation from tests/conftest.py and examples/conftest.py - make sure it's called only once
+pytest_opt_registered = {}
+
+
+def pytest_addoption_shared(parser):
+ """
+ This function is to be called from `conftest.py` via `pytest_addoption` wrapper that has to be defined there.
+
+ It allows loading both `conftest.py` files at once without causing a failure due to adding the same `pytest`
+ option.
+
+ """
+ option = "--make-reports"
+ if option not in pytest_opt_registered:
+ parser.addoption(
+ option,
+ action="store",
+ default=False,
+ help="generate report files. The value of this option is used as a prefix to report names",
+ )
+ pytest_opt_registered[option] = 1
+
+
+def pytest_terminal_summary_main(tr, id):
+ """
+ Generate multiple reports at the end of test suite run - each report goes into a dedicated file in the current
+ directory. The report files are prefixed with the test suite name.
+
+ This function emulates --duration and -rA pytest arguments.
+
+ This function is to be called from `conftest.py` via `pytest_terminal_summary` wrapper that has to be defined
+ there.
+
+ Args:
+ - tr: `terminalreporter` passed from `conftest.py`
+ - id: unique id like `tests` or `examples` that will be incorporated into the final reports filenames - this is
+ needed as some jobs have multiple runs of pytest, so we can't have them overwrite each other.
+
+ NB: this function taps into a private _pytest API and, while unlikely, it could break should
+ pytest make internal changes - it also calls default internal methods of terminalreporter which
+ can be hijacked by various `pytest-` plugins and interfere.
+
+ """
+ from _pytest.config import create_terminal_writer
+
+ if not len(id):
+ id = "tests"
+
+ config = tr.config
+ orig_writer = config.get_terminal_writer()
+ orig_tbstyle = config.option.tbstyle
+ orig_reportchars = tr.reportchars
+
+ dir = "reports"
+ Path(dir).mkdir(parents=True, exist_ok=True)
+ report_files = {
+ k: f"{dir}/{id}_{k}.txt"
+ for k in [
+ "durations",
+ "errors",
+ "failures_long",
+ "failures_short",
+ "failures_line",
+ "passes",
+ "stats",
+ "summary_short",
+ "warnings",
+ ]
+ }
+
+ # custom durations report
+ # note: there is no need to call pytest --durations=XX to get this separate report
+ # adapted from https://github.com/pytest-dev/pytest/blob/897f151e/src/_pytest/runner.py#L66
+ dlist = []
+ for replist in tr.stats.values():
+ for rep in replist:
+ if hasattr(rep, "duration"):
+ dlist.append(rep)
+ if dlist:
+ dlist.sort(key=lambda x: x.duration, reverse=True)
+ with open(report_files["durations"], "w") as f:
+ durations_min = 0.05 # sec
+ f.write("slowest durations\n")
+ for i, rep in enumerate(dlist):
+ if rep.duration < durations_min:
+ f.write(f"{len(dlist)-i} durations < {durations_min} secs were omitted")
+ break
+ f.write(f"{rep.duration:02.2f}s {rep.when:<8} {rep.nodeid}\n")
+
+ def summary_failures_short(tr):
+ # expecting that the reports were --tb=long (default) so we chop them off here to the last frame
+ reports = tr.getreports("failed")
+ if not reports:
+ return
+ tr.write_sep("=", "FAILURES SHORT STACK")
+ for rep in reports:
+ msg = tr._getfailureheadline(rep)
+ tr.write_sep("_", msg, red=True, bold=True)
+ # chop off the optional leading extra frames, leaving only the last one
+ longrepr = re.sub(r".*_ _ _ (_ ){10,}_ _ ", "", rep.longreprtext, 0, re.M | re.S)
+ tr._tw.line(longrepr)
+ # note: not printing out any rep.sections to keep the report short
+
+ # use ready-made report funcs, we are just hijacking the filehandle to log to a dedicated file each
+ # adapted from https://github.com/pytest-dev/pytest/blob/897f151e/src/_pytest/terminal.py#L814
+ # note: some pytest plugins may interfere by hijacking the default `terminalreporter` (e.g.
+ # pytest-instafail does that)
+
+ # report failures with line/short/long styles
+ config.option.tbstyle = "auto" # full tb
+ with open(report_files["failures_long"], "w") as f:
+ tr._tw = create_terminal_writer(config, f)
+ tr.summary_failures()
+
+ # config.option.tbstyle = "short" # short tb
+ with open(report_files["failures_short"], "w") as f:
+ tr._tw = create_terminal_writer(config, f)
+ summary_failures_short(tr)
+
+ config.option.tbstyle = "line" # one line per error
+ with open(report_files["failures_line"], "w") as f:
+ tr._tw = create_terminal_writer(config, f)
+ tr.summary_failures()
+
+ with open(report_files["errors"], "w") as f:
+ tr._tw = create_terminal_writer(config, f)
+ tr.summary_errors()
+
+ with open(report_files["warnings"], "w") as f:
+ tr._tw = create_terminal_writer(config, f)
+ tr.summary_warnings() # normal warnings
+ tr.summary_warnings() # final warnings
+
+ tr.reportchars = "wPpsxXEf" # emulate -rA (used in summary_passes() and short_test_summary())
+ with open(report_files["passes"], "w") as f:
+ tr._tw = create_terminal_writer(config, f)
+ tr.summary_passes()
+
+ with open(report_files["summary_short"], "w") as f:
+ tr._tw = create_terminal_writer(config, f)
+ tr.short_test_summary()
+
+ with open(report_files["stats"], "w") as f:
+ tr._tw = create_terminal_writer(config, f)
+ tr.summary_stats()
+
+ # restore:
+ tr._tw = orig_writer
+ tr.reportchars = orig_reportchars
+ config.option.tbstyle = orig_tbstyle
+
+
+class CaptureLogger:
+ """
+ Args:
+ Context manager to capture `logging` streams
+ logger: 'logging` logger object
+ Returns:
+ The captured output is available via `self.out`
+ Example:
+ ```python
+ >>> from diffusers import logging
+ >>> from diffusers.testing_utils import CaptureLogger
+
+ >>> msg = "Testing 1, 2, 3"
+ >>> logging.set_verbosity_info()
+ >>> logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.py")
+ >>> with CaptureLogger(logger) as cl:
+ ... logger.info(msg)
+ >>> assert cl.out == msg + "\n"
+ ```
+ """
+
+ def __init__(self, logger):
+ self.logger = logger
+ self.io = StringIO()
+ self.sh = logging.StreamHandler(self.io)
+ self.out = ""
+
+ def __enter__(self):
+ self.logger.addHandler(self.sh)
+ return self
+
+ def __exit__(self, *exc):
+ self.logger.removeHandler(self.sh)
+ self.out = self.io.getvalue()
+
+ def __repr__(self):
+ return f"captured: {self.out}\n"
diff --git a/src/model/TextGen/environment.yaml b/src/model/TextGen/environment.yaml
new file mode 100644
index 0000000..5ebc4f3
--- /dev/null
+++ b/src/model/TextGen/environment.yaml
@@ -0,0 +1,104 @@
+name: textctrl
+channels:
+ - conda-forge
+ - defaults
+dependencies:
+ - _libgcc_mutex=0.1=main
+ - _openmp_mutex=5.1=1_gnu
+ - ca-certificates=2024.3.11=h06a4308_0
+ - ld_impl_linux-64=2.38=h1181459_1
+ - libffi=3.4.4=h6a678d5_1
+ - libgcc-ng=11.2.0=h1234567_1
+ - libgomp=11.2.0=h1234567_1
+ - libstdcxx-ng=11.2.0=h1234567_1
+ - ncurses=6.4=h6a678d5_0
+ - openssl=3.0.13=h7f8727e_1
+ - pip=23.3.1=py38h06a4308_0
+ - python=3.8.19=h955ad1f_0
+ - readline=8.2=h5eee18b_0
+ - setuptools=68.2.2=py38h06a4308_0
+ - sqlite=3.45.3=h5eee18b_0
+ - tk=8.6.12=h1ccaba5_0
+ - wheel=0.43.0=py38h06a4308_0
+ - xz=5.4.6=h5eee18b_1
+ - zlib=1.2.13=h5eee18b_1
+ - pip:
+ - absl-py==2.1.0
+ - accelerate==0.30.0
+ - aiohttp==3.9.5
+ - aiosignal==1.3.1
+ - antlr4-python3-runtime==4.9.3
+ - appdirs==1.4.4
+ - async-timeout==4.0.3
+ - attrs==23.2.0
+ - cachetools==5.3.3
+ - certifi==2024.2.2
+ - charset-normalizer==3.3.2
+ - click==8.1.7
+ - contourpy==1.1.1
+ - cycler==0.12.1
+ - docker-pycreds==0.4.0
+ - einops==0.8.0
+ - filelock==3.14.0
+ - fonttools==4.51.0
+ - frozenlist==1.4.1
+ - fsspec==2024.3.1
+ - gitdb==4.0.11
+ - gitpython==3.1.43
+ - google-auth==2.29.0
+ - google-auth-oauthlib==0.4.6
+ - grpcio==1.63.0
+ - huggingface-hub==0.23.0
+ - idna==3.7
+ - importlib-metadata==7.1.0
+ - importlib-resources==6.4.0
+ - kiwisolver==1.4.5
+ - lightning-utilities==0.11.2
+ - lpips==0.1.4
+ - markdown==3.6
+ - markupsafe==2.1.5
+ - matplotlib==3.7.5
+ - multidict==6.0.5
+ - natsort==8.4.0
+ - numpy==1.24.4
+ - oauthlib==3.2.2
+ - omegaconf==2.3.0
+ - opencv-python==4.9.0.80
+ - packaging==24.0
+ - pillow==9.5.0
+ - protobuf==3.19.6
+ - psutil==5.9.8
+ - pyasn1==0.6.0
+ - pyasn1-modules==0.4.0
+ - pyparsing==3.1.2
+ - python-dateutil==2.9.0.post0
+ - pytorch-fid==0.3.0
+ - pytorch-lightning==1.9.1
+ - pyyaml==6.0.1
+ - regex==2024.5.15
+ - requests==2.31.0
+ - requests-oauthlib==2.0.0
+ - rsa==4.9
+ - safetensors==0.4.3
+ - scipy==1.10.1
+ - sentry-sdk==2.0.1
+ - setproctitle==1.3.3
+ - six==1.16.0
+ - smmap==5.0.1
+ - tensorboard==2.10.0
+ - tensorboard-data-server==0.6.1
+ - tensorboard-plugin-wit==1.8.1
+ - timm==0.9.16
+ - tokenizers==0.19.1
+ - torch==1.13.1+cu116
+ - torchaudio==0.13.1+cu116
+ - torchmetrics==1.3.2
+ - torchvision==0.14.1+cu116
+ - tqdm==4.66.4
+ - transformers==4.41.0
+ - typing-extensions==4.11.0
+ - urllib3==2.2.1
+ - wandb==0.16.6
+ - werkzeug==3.0.3
+ - yarl==1.9.4
+ - zipp==3.18.1
diff --git a/src/model/TextGen/evaluation/eval_utils.py b/src/model/TextGen/evaluation/eval_utils.py
new file mode 100644
index 0000000..7164717
--- /dev/null
+++ b/src/model/TextGen/evaluation/eval_utils.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+"""Module providing functionality surrounding gaussian function.
+"""
+import os
+import numpy as np
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision.transforms import Compose, ToTensor, Resize
+
+
+def fspecial_gauss(size, sigma):
+ """Function to mimic the 'fspecial' gaussian MATLAB function
+ """
+ x, y = np.mgrid[-size // 2 + 1:size //
+ 2 + 1, -size // 2 + 1:size // 2 + 1]
+ g = np.exp(-((x**2 + y**2) / (2.0 * sigma**2)))
+ return g / g.sum()
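+
+
+# e.g. fspecial_gauss(11, 1.5) yields an 11x11 kernel summing to 1, mirroring
+# MATLAB's fspecial('gaussian', 11, 1.5) as used by the reference SSIM code.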
+
+
+def CheckImageFile(filename):
+ return any(filename.endswith(extension) for extension in [
+ '.png', '.PNG', '.jpg', '.JPG', '.jpeg', '.JPEG', '.bmp', '.BMP'])
+
+
+def ImageTransform(loadSize):
+ return Compose([
+ Resize(size=loadSize, interpolation=Image.BICUBIC),
+ ToTensor(),
+ ])
+
+
+class devdata(Dataset):
+ def __init__(self, dataRoot, gtRoot, loadSize=512):
+ super(devdata, self).__init__()
+ self.imageFiles = [os.path.join(dataRoot, filename) for filename
+ in os.listdir(dataRoot) if CheckImageFile(filename)]
+ # filenames are listed from dataRoot so that image/ground-truth pairs stay aligned by name
+ self.gtFiles = [os.path.join(gtRoot, filename) for filename
+ in os.listdir(dataRoot) if CheckImageFile(filename)]
+ self.loadSize = loadSize
+
+ def __getitem__(self, index):
+ img = Image.open(self.imageFiles[index])
+ gt = Image.open(self.gtFiles[index])
+ to_scale = gt.size
+ inputImage = ImageTransform(to_scale)(img.convert('RGB'))
+ groundTruth = ImageTransform(to_scale)(gt.convert('RGB'))
+ path = self.imageFiles[index].split('/')[-1]
+
+ return inputImage, groundTruth, path
+
+ def __len__(self):
+ return len(self.imageFiles)
diff --git a/src/model/TextGen/evaluation/evaluation.py b/src/model/TextGen/evaluation/evaluation.py
new file mode 100644
index 0000000..0529795
--- /dev/null
+++ b/src/model/TextGen/evaluation/evaluation.py
@@ -0,0 +1,135 @@
+import os
+import math
+import argparse
+import torch
+import lpips
+import numpy as np
+from PIL import Image
+from torch.utils.data import DataLoader
+from scipy import signal, ndimage
+from tqdm import tqdm
+from pytorch_fid import fid_score
+from eval_utils import devdata, fspecial_gauss
+from torchvision.transforms import Compose, ToTensor, Resize
+
+
+def ssim(img1, img2, cs_map=False):
+ """Return the Structural Similarity Map corresponding to input images img1
+ and img2 (images are assumed to be uint8)
+
+ This function attempts to mimic precisely the functionality of ssim.m, the
+ MATLAB implementation provided by the authors of SSIM:
+ https://ece.uwaterloo.ca/~z70wang/research/ssim/ssim_index.m
+ """
+ img1 = img1.astype(float)
+ img2 = img2.astype(float)
+
+ size = min(img1.shape[0], 11)
+ sigma = 1.5
+ window = fspecial_gauss(size, sigma)
+ K1 = 0.01
+ K2 = 0.03
+ L = 255 # dynamic range of 8-bit pixel values (2**8 - 1)
+ C1 = (K1 * L) ** 2
+ C2 = (K2 * L) ** 2
+ mu1 = signal.fftconvolve(img1, window, mode='valid')
+ mu2 = signal.fftconvolve(img2, window, mode='valid')
+ mu1_sq = mu1 * mu1
+ mu2_sq = mu2 * mu2
+ mu1_mu2 = mu1 * mu2
+ sigma1_sq = signal.fftconvolve(img1 * img1, window, mode='valid') - mu1_sq
+ sigma2_sq = signal.fftconvolve(img2 * img2, window, mode='valid') - mu2_sq
+ sigma12 = signal.fftconvolve(img1 * img2, window, mode='valid') - mu1_mu2
+ if cs_map:
+ return (((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)),
+ (2.0 * sigma12 + C2) / (sigma1_sq + sigma2_sq + C2))
+ else:
+ return ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) *
+ (sigma1_sq + sigma2_sq + C2))
+
+
+def msssim(img1, img2):
+ """This function implements Multi-Scale Structural Similarity (MSSSIM) Image
+ Quality Assessment according to Z. Wang's "Multi-scale structural similarity
+ for image quality assessment" Invited Paper, IEEE Asilomar Conference on
+ Signals, Systems and Computers, Nov. 2003
+
+ Author's MATLAB implementation:
+ http://www.cns.nyu.edu/~lcv/ssim/msssim.zip
+ """
+ level = 5
+ weight = np.array([0.0448, 0.2856, 0.3001, 0.2363, 0.1333])
+ downsample_filter = np.ones((2, 2)) / 4.0
+ mssim = np.array([])
+ mcs = np.array([])
+ for l in range(level):
+ ssim_map, cs_map = ssim(img1, img2, cs_map=True)
+ mssim = np.append(mssim, ssim_map.mean())
+ mcs = np.append(mcs, cs_map.mean())
+ filtered_im1 = ndimage.convolve(img1, downsample_filter, mode='reflect')
+ filtered_im2 = ndimage.convolve(img2, downsample_filter, mode='reflect')
+ im1 = filtered_im1[:: 2, :: 2]
+ im2 = filtered_im2[:: 2, :: 2]
+ # Note: Remove the negative and add it later to avoid NaN in exponential.
+ sign_mcs = np.sign(mcs[0: level - 1])
+ sign_mssim = np.sign(mssim[level - 1])
+ mcs_power = np.power(np.abs(mcs[0: level - 1]), weight[0: level - 1])
+ mssim_power = np.power(np.abs(mssim[level - 1]), weight[level - 1])
+ return np.prod(sign_mcs * mcs_power) * sign_mssim * mssim_power
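+
+# Minimal usage sketch: compare two same-sized grayscale arrays (large enough
+# for the 5 downsampling levels), e.g.
+#   a = np.asarray(Image.open("a.png").convert("L"), dtype=float)
+#   b = np.asarray(Image.open("b.png").convert("L"), dtype=float)
+#   score = msssim(a, b)  # ~1.0 for near-identical images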
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--target_path', type=str, default='', help='results')
+ parser.add_argument('--gt_path', type=str, default='', help='labels')
+ args = parser.parse_args()
+ img_path = args.target_path
+ gt_path = args.gt_path
+
+
+ sum_psnr = 0
+ sum_ssim = 0
+ sum_mse = 0
+ count = 0
+ sum_time = 0.0
+ l1_loss = 0
+
+ imgData = devdata(dataRoot=img_path, gtRoot=gt_path)
+ data_loader = DataLoader(
+ imgData,
+ batch_size=1,
+ shuffle=False,
+ num_workers=0,
+ drop_last=False)
+
+ for idx, (img, lbl, path) in tqdm(enumerate(data_loader), total=len(data_loader)):
+ try:
+ mse = ((lbl - img) ** 2).mean()
+ sum_mse += mse
+ if mse == 0:
+ continue
+ count += 1
+ psnr = 10 * math.log10(1 / mse)
+ sum_psnr += psnr
+
+ R = lbl[0, 0, :, :]
+ G = lbl[0, 1, :, :]
+ B = lbl[0, 2, :, :]
+ YGT = .299 * R + .587 * G + .114 * B
+ R = img[0, 0, :, :]
+ G = img[0, 1, :, :]
+ B = img[0, 2, :, :]
+ YBC = .299 * R + .587 * G + .114 * B
+ mssim = msssim(np.array(YGT * 255), np.array(YBC * 255))
+ sum_ssim += mssim
+ except Exception as e:
+ print("Skip img {} because of {}".format(path, e))
+
+ batch_size = 1
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ dims = 2048
+ fid_value = fid_score.calculate_fid_given_paths([str(gt_path), str(img_path)], batch_size, device, dims)
+
+ print('SSIM:', sum_ssim / count)
+ print('PSNR:', sum_psnr / count)
+ print('MSE:', sum_mse.item() / count)
+ print('FID:', fid_value)
diff --git a/src/model/TextGen/inference.py b/src/model/TextGen/inference.py
new file mode 100644
index 0000000..ab0e90d
--- /dev/null
+++ b/src/model/TextGen/inference.py
@@ -0,0 +1,859 @@
+from __future__ import annotations  # postpone annotation evaluation so list[...] hints work on Python 3.8
+
+import torch
+import os
+import json
+import re
+import unicodedata
+from typing import Optional
+from src.trainer.CtrlBase import ControlBase
+from PIL import Image
+import numpy as np
+import torchvision.transforms as T
+from tqdm import tqdm
+from src.MuSA.GaMuSA_app import text_editing
+from src.MuSA.GaMuSA import GaMuSA
+from argparse import ArgumentParser
+from pytorch_lightning import seed_everything
+from utils import create_model, load_state_dict
+from PIL import ImageDraw
+
+try:
+ # Lazy optional import; only required for --ocr_mode
+ from paddleocr import PaddleOCR # type: ignore
+except Exception: # pragma: no cover
+ PaddleOCR = None
+
+# Optional: docTR for DBNet detection backend
+try:
+ from doctr.io import DocumentFile # type: ignore
+ from doctr.models import ocr_predictor as doctr_ocr_predictor # type: ignore
+except Exception:
+ DocumentFile = None
+ doctr_ocr_predictor = None
+
+
+def load_image(image_path, image_height=256, image_width=256):
+ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+ img = Image.open(image_path)
+ image = T.ToTensor()(T.Resize((image_height, image_width))(img.convert("RGB")))
+ image = image.to(device)
+ return image.unsqueeze(0)
+
+
+def create_parser():
+ parser = ArgumentParser()
+ parser.add_argument("--seed", type=int, default=42)
+ parser.add_argument("--target_height", default=256)
+ parser.add_argument("--teaget_width", default=256)
+ parser.add_argument("--style_height", default=256)
+ parser.add_argument("--style_width", default=256)
+ parser.add_argument("--ckpt_path", type=str, default="weights/model.pth")
+ parser.add_argument("--dataset_dir", type=str, default="example/")
+ parser.add_argument("--output_dir", type=str, default="example_result/")
+ parser.add_argument("--starting_layer", default=10, type=int)
+ parser.add_argument("--num_inference_steps", type=int, default=50)
+ parser.add_argument("--num_sample_per_image", default=1, type=int)
+ parser.add_argument("--guidance_scale", default=2, type=float)
+ parser.add_argument("--benchmark", default=True)
+ # OCR + rule-based editing
+ parser.add_argument("--ocr_mode", action="store_true", help="Enable OCR-driven rule-based text replacement")
+ parser.add_argument("--input_image", type=str, default=None, help="Path to a single input image when using --ocr_mode")
+ parser.add_argument("--rules_json", type=str, default=None, help="JSON file specifying replacements, e.g. {\"NOM\": \"NGUYEN\", \"GIVEN_NAMES\": \"AndrewNguyen\"}")
+ # OCR controls
+ parser.add_argument("--ocr_lang", type=str, default="fr", help="OCR language (e.g., fr, en, vi)")
+ parser.add_argument("--ocr_det_model", type=str, default="", help="Override detection model name (e.g., PP-OCRv5_server_det for DBNet)")
+ parser.add_argument("--ocr_rec_model", type=str, default="", help="Override recognition model name (e.g., PP-OCRv5_server_rec)")
+ parser.add_argument("--ocr_version", type=str, default="", help="Optional OCR version hint (e.g., PP-OCRv5)")
+ parser.add_argument("--detector_backend", type=str, default="paddle", choices=["paddle", "doctr"], help="Detector backend: paddle (default) or doctr DBNet")
+ parser.add_argument("--doctr_det_arch", type=str, default="db_resnet50", help="docTR detection architecture (e.g., db_resnet50)")
+ return parser
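+
+
+# Example invocations (paths and rule values are placeholders):
+#   benchmark mode over a prepared dataset directory:
+#     PYTHONPATH=. python inference.py --dataset_dir example/ --output_dir example_result/
+#   OCR-driven rule-based editing of a single image:
+#     PYTHONPATH=. python inference.py --ocr_mode --input_image card.png --rules_json rules.json
+#   where rules.json contains e.g. {"NOM": "NGUYEN", "GIVEN_NAMES": "AndrewNguyen"}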
+
+
+def _strip_accents(text: str) -> str:
+ """Remove accents for robust matching."""
+ return "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn")
+
+
+def _sanitize(text: str) -> str:
+ """Uppercase and remove non-letters for coarse rule matching."""
+ text = _strip_accents(text)
+ text = re.sub(r"[^A-Za-z,\-\s]", "", text)
+ return text.strip().upper()
+
+
+def _array_to_tensor(arr: np.ndarray, device: torch.device) -> torch.Tensor:
+ """Convert a HxWxC float[0..1] array to a 1xCxHxW tensor on device."""
+ if arr.max() > 1.0:
+ arr = arr / 255.0
+ tensor = torch.from_numpy(arr).float().permute(2, 0, 1).unsqueeze(0).to(device)
+ return tensor
+
+
+def _run_paddle_ocr(image_path: str, ocr_kwargs: Optional[dict] = None):
+ if PaddleOCR is None:
+ raise RuntimeError("PaddleOCR is not installed. Please install paddleocr to use --ocr_mode.")
+ ocr_kwargs = ocr_kwargs or {}
+ # Always enable textline orientation if available
+ ocr_kwargs.setdefault("use_textline_orientation", True)
+ ocr = PaddleOCR(**ocr_kwargs)
+ try:
+ return ocr.predict(image_path)
+ except Exception:
+ # Fallback for older builds
+ return ocr.ocr(image_path)
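+
+
+# e.g. _run_paddle_ocr("scan.png", {"lang": "fr"}) - the kwargs dict is forwarded
+# verbatim to the PaddleOCR constructor; predict() is tried first and ocr() is
+# kept as a fallback for older paddleocr builds.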
+
+
+def _run_doctr_dbnet_detect(image_path: str, det_arch: str = "db_resnet50") -> list[list[list[int]]]:
+ """Run docTR DBNet detection and return list of polygons [[x,y],...].
+ Coordinates returned are absolute pixel values.
+ """
+ if doctr_ocr_predictor is None or DocumentFile is None:
+ raise RuntimeError("python-doctr is not installed. pip install python-doctr")
+ try:
+ predictor = doctr_ocr_predictor(det_arch=det_arch, reco_arch="crnn_vgg16_bn", pretrained=True)
+ doc = DocumentFile.from_images([image_path])
+ result = predictor(doc)
+ # docTR uses relative geometry [ (x0,y0), ... ] in [0,1]
+ im = Image.open(image_path)
+ W, H = im.size
+ polys: list[list[list[int]]] = []
+ # Try multiple docTR result shapes robustly: object API, export() dict, page-level export
+ try:
+ pages = getattr(result, 'pages', None)
+ if pages and len(pages) > 0:
+ page = pages[0]
+ # Walk blocks → lines → words
+ for blk in getattr(page, 'blocks', []) or []:
+ for ln in getattr(blk, 'lines', []) or []:
+ for wd in getattr(ln, 'words', []) or []:
+ g = getattr(wd, 'geometry', None)
+ if g is None:
+ continue
+ gl = list(g)
+ if len(gl) == 2:
+ # geometry provided as (xmin,ymin),(xmax,ymax)
+ (xmin, ymin), (xmax, ymax) = gl
+ pts = [
+ [int(round(xmin * W)), int(round(ymin * H))],
+ [int(round(xmax * W)), int(round(ymin * H))],
+ [int(round(xmax * W)), int(round(ymax * H))],
+ [int(round(xmin * W)), int(round(ymax * H))],
+ ]
+ else:
+ pts = [[int(round(x * W)), int(round(y * H))] for (x, y) in gl]
+ if len(pts) >= 3:
+ polys.append(pts)
+ except Exception:
+ pass
+ try:
+ # 2) Export dict may have 'boxes'
+ if not polys:
+ exp = result.export() if hasattr(result, 'export') else None
+ if isinstance(exp, dict):
+ pages = exp.get('pages') or []
+ if pages:
+ p0 = pages[0]
+ boxes = p0.get('boxes') or (p0.get('predictions', {}) or {}).get('boxes')
+ if boxes:
+ for (xmin, ymin, xmax, ymax) in list(boxes):
+ poly = [
+ [int(round(xmin * W)), int(round(ymin * H))],
+ [int(round(xmax * W)), int(round(ymin * H))],
+ [int(round(xmax * W)), int(round(ymax * H))],
+ [int(round(xmin * W)), int(round(ymax * H))],
+ ]
+ polys.append(poly)
+ except Exception:
+ pass
+ try:
+ # 3) Older exports list of polygons [[(x,y)...]]
+ if not polys:
+ pages = getattr(result, 'pages', None)
+ if pages and len(pages) > 0:
+ page = pages[0]
+ if hasattr(page, 'export'):
+ pexp = page.export()
+ # Look for nested geometries
+ for blk in pexp.get('blocks', []) or []:
+ for ln in blk.get('lines', []) or []:
+ for wd in ln.get('words', []) or []:
+ g = wd.get('geometry')
+ if g is None:
+ continue
+ gl = list(g)
+ if len(gl) == 2:
+ (xmin, ymin), (xmax, ymax) = gl
+ pts = [
+ [int(round(xmin * W)), int(round(ymin * H))],
+ [int(round(xmax * W)), int(round(ymin * H))],
+ [int(round(xmax * W)), int(round(ymax * H))],
+ [int(round(xmin * W)), int(round(ymax * H))],
+ ]
+ else:
+ pts = [[int(round(x * W)), int(round(y * H))] for (x, y) in gl]
+ if len(pts) >= 3:
+ polys.append(pts)
+ except Exception:
+ pass
+ return polys
+ except Exception:
+ return []
+
+
+def _extract_text_conf_from_result(result: object) -> list[tuple[str, float]]:
+ """Normalize PaddleOCR result to a flat list of (text, conf)."""
+ items: list[tuple[str, float]] = []
+ try:
+ if isinstance(result, dict):
+ texts = result.get("rec_texts") or result.get("rec_text") or result.get("texts") or []
+ scores = result.get("rec_scores") or result.get("scores") or []
+ if isinstance(texts, str):
+ texts = [texts]
+ if not hasattr(scores, "__len__"):
+ scores = [None] * len(texts)
+ for i, t in enumerate(list(texts)):
+ sc = 0.0
+ try:
+ sc = float(scores[i]) if i < len(scores) and scores[i] is not None else 0.0
+ except Exception:
+ sc = 0.0
+ items.append((str(t), sc))
+ return items
+
+ if isinstance(result, list):
+ if len(result) == 1 and isinstance(result[0], dict):
+ return _extract_text_conf_from_result(result[0])
+ # Legacy nested [[[poly], (text, score)], ...]
+ if len(result) > 0 and isinstance(result[0], list) and len(result[0]) > 0 and isinstance(result[0][0], list):
+ for line in result[0]:
+ if not isinstance(line, (list, tuple)) or len(line) < 2:
+ continue
+ payload = line[1]
+ text = ""
+ score = 0.0
+ if isinstance(payload, (list, tuple)) and len(payload) >= 1:
+ text = str(payload[0])
+ if len(payload) >= 2:
+ try:
+ score = float(payload[1])
+ except Exception:
+ score = 0.0
+ elif isinstance(payload, str):
+ text = payload
+ if len(line) >= 3:
+ try:
+ score = float(line[2])
+ except Exception:
+ score = 0.0
+ items.append((text, score))
+ return items
+ # Dict list
+ if len(result) > 0 and isinstance(result[0], dict):
+ for obj in result:
+ text = obj.get("text") or obj.get("rec_text") or ""
+ score = obj.get("score") or obj.get("rec_score") or 0.0
+ try:
+ score = float(score)
+ except Exception:
+ score = 0.0
+ items.append((str(text), float(score)))
+ return items
+ # Flat list [poly, text, score]
+ for line in result:
+ if not isinstance(line, (list, tuple)) or len(line) < 2:
+ continue
+ text = str(line[1])
+ score = 0.0
+ if len(line) >= 3:
+ try:
+ score = float(line[2])
+ except Exception:
+ score = 0.0
+ items.append((text, score))
+ return items
+ except Exception:
+ return items
+ return items
+
+
+def _extract_items_with_boxes(result: object) -> list[dict]:
+ """Return a list of {'text': str, 'score': float, 'box': [[x,y], ...]}.
+ Tries to support multiple PaddleOCR output shapes.
+ """
+ def to_pts(poly) -> list[list[int]]:
+ try:
+ if poly is None:
+ return []
+ if hasattr(poly, "tolist"):
+ poly = poly.tolist()
+ return [[int(p[0]), int(p[1])] for p in list(poly)]
+ except Exception:
+ return []
+
+ items: list[dict] = []
+ try:
+ if isinstance(result, dict):
+ texts = result.get("rec_texts") or result.get("texts") or []
+ scores = result.get("rec_scores") or result.get("scores") or []
+ boxes = result.get("dt_polys") or result.get("det_polygons") or result.get("boxes") or []
+ if isinstance(texts, str):
+ texts = [texts]
+ if not hasattr(scores, "__len__"):
+ scores = [None] * len(texts)
+ n = len(texts)
+ for i in range(n):
+ txt = str(texts[i])
+ try:
+ sc = float(scores[i]) if i < len(scores) and scores[i] is not None else 0.0
+ except Exception:
+ sc = 0.0
+ bx = to_pts(boxes[i]) if i < len(boxes) else []
+ items.append({"text": txt, "score": sc, "box": bx})
+ return items
+ if isinstance(result, list):
+ if len(result) == 1 and isinstance(result[0], dict):
+ return _extract_items_with_boxes(result[0])
+ if len(result) > 0 and isinstance(result[0], list) and len(result[0]) > 0 and isinstance(result[0][0], list):
+ for line in result[0]:
+ if not isinstance(line, (list, tuple)) or len(line) < 2:
+ continue
+ box = to_pts(line[0])
+ payload = line[1]
+ text = ""
+ score = 0.0
+ if isinstance(payload, (list, tuple)) and len(payload) >= 1:
+ text = str(payload[0])
+ if len(payload) >= 2:
+ try:
+ score = float(payload[1])
+ except Exception:
+ score = 0.0
+ elif isinstance(payload, str):
+ text = payload
+ if len(line) >= 3:
+ try:
+ score = float(line[2])
+ except Exception:
+ score = 0.0
+ items.append({"text": text, "score": score, "box": box})
+ return items
+ if len(result) > 0 and isinstance(result[0], dict):
+ for obj in result:
+ text = obj.get("text") or obj.get("rec_text") or ""
+ score = obj.get("score") or obj.get("rec_score") or 0.0
+ box = obj.get("box") or obj.get("poly") or obj.get("det_polygons") or []
+ try:
+ score = float(score)
+ except Exception:
+ score = 0.0
+ items.append({"text": str(text), "score": float(score), "box": to_pts(box)})
+ return items
+ except Exception:
+ return items
+ return items
+
+
+def _bbox_center(box: list[list[int]]) -> tuple[float, float]:
+ xs = [p[0] for p in box]
+ ys = [p[1] for p in box]
+ return (float(sum(xs)) / max(len(xs), 1), float(sum(ys)) / max(len(ys), 1))
+
+
+def _distance(p1: tuple[float, float], p2: tuple[float, float]) -> float:
+ return ((p1[0] - p2[0]) ** 2 + (p1[1] - p2[1]) ** 2) ** 0.5
+
+
+def _expand_box(box: list[list[int]], img_w: int, img_h: int, pad_ratio: float = 0.15) -> tuple[int, int, int, int]:
+ xs = [p[0] for p in box]
+ ys = [p[1] for p in box]
+ x0, y0, x1, y1 = min(xs), min(ys), max(xs), max(ys)
+ w = x1 - x0
+ h = y1 - y0
+ pad_x = int(w * pad_ratio)
+ pad_y = int(h * pad_ratio)
+ x0 = max(0, x0 - pad_x)
+ y0 = max(0, y0 - pad_y)
+ x1 = min(img_w, x1 + pad_x)
+ y1 = min(img_h, y1 + pad_y)
+ return x0, y0, x1, y1
+
+
+def _poly_to_rect(box: list[list[int]]) -> list[int]:
+ if not box:
+ return [0, 0, 0, 0]
+ xs = [p[0] for p in box]
+ ys = [p[1] for p in box]
+ return [int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))]
+
+
+def _pil_to_tensor(img: Image.Image, device: torch.device, image_height: int = 256, image_width: int = 256) -> torch.Tensor:
+ arr = T.ToTensor()(T.Resize((image_height, image_width))(img.convert("RGB")))
+ return arr.unsqueeze(0).to(device)
+
+
+def _run_local_edit(pipeline: GaMuSA, crop_img: Image.Image, old_text: str, new_text: str,
+ starting_layer: int, num_inference_steps: int, guidance_scale: float,
+ device: torch.device) -> Image.Image:
+ source_tensor = _pil_to_tensor(crop_img, device)
+ style_tensor = _pil_to_tensor(crop_img, device)
+ result = text_editing(
+ pipeline,
+ source_tensor,
+ style_tensor,
+ old_text,
+ new_text,
+ starting_layer=starting_layer,
+ ddim_steps=num_inference_steps,
+ scale=guidance_scale,
+ )
+ _, out_np = result[:]
+ out_img = Image.fromarray((out_np * 255).astype(np.uint8))
+ return out_img
+
+
+def _match_rules_to_regions(ocr_items: list[dict], rules: dict[str, str]) -> list[dict]:
+ """Return a list of {old_text, new_text, box} for localized edits.
+ Uses label anchors like NOM/SURNAME and PRENOMS/GIVEN NAMES to choose nearby values.
+ """
+ # Build anchors
+ def matches_any(text: str, keywords: list[str]) -> bool:
+ nt = _sanitize(text)
+ return any(k in nt for k in keywords)
+
+ anchors: dict[str, list[dict]] = {
+ "NOM": [],
+ "GIVEN_NAMES": [],
+ }
+ for it in ocr_items:
+ t = it.get("text", "")
+ if matches_any(t, ["NOM", "SURNAME"]):
+ anchors["NOM"].append(it)
+ if matches_any(t, ["PRENOM", "GIVEN", "GIVENNAMES", "PRENOMS", "GIVEN NAMES"]):
+ anchors["GIVEN_NAMES"].append(it)
+
+ # Candidates for values
+ def is_surname_candidate(t: str) -> bool:
+ nt = _sanitize(t)
+ if any(k in nt for k in ["NOM", "SURNAME", "PRENOM", "GIVEN", "GIVENNAMES", "PRENOMS"]):
+ return False
+ return len(nt) >= 3 and nt == nt.upper() and re.fullmatch(r"[A-Z\-\s]+", nt or "") is not None
+
+ def is_given_candidate(t: str) -> bool:
+ nt = _sanitize(t)
+ if any(k in nt for k in ["NOM", "SURNAME", "PRENOM", "GIVEN", "GIVENNAMES", "PRENOMS"]):
+ return False
+ # mixed or lowercase with separators (avoid all-caps)
+ return any(c.islower() for c in t) or ("-" in t or "," in t)
+
+ tasks: list[dict] = []
+ for key, new_val in rules.items():
+ key_u = key.strip().upper()
+ if key_u not in ("NOM", "GIVEN_NAMES"):
+ continue
+ anchor_list = anchors.get(key_u, [])
+ best_item = None
+ best_dist = 1e9
+ for it in ocr_items:
+ t = it.get("text", "")
+ if key_u == "NOM" and not is_surname_candidate(t):
+ continue
+ if key_u == "GIVEN_NAMES" and not is_given_candidate(t):
+ continue
+            # Distance to the nearest anchor label if one exists; otherwise fall
+            # back on the recognition score so the most confident candidate wins
+            if anchor_list:
+                c = _bbox_center(it.get("box") or [])
+                dists = [_distance(c, _bbox_center(an.get("box") or [])) for an in anchor_list]
+                dist = min(dists) if dists else 0.0
+            else:
+                dist = -float(it.get("score") or 0.0)
+ if dist < best_dist:
+ best_dist = dist
+ best_item = it
+ if best_item is not None and best_item.get("box"):
+ tasks.append({"old_text": best_item["text"], "new_text": new_val, "box": best_item["box"]})
+ return tasks
+
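+# Example output of _match_rules_to_regions (hypothetical values):
+# rules={"NOM": "NGUYEN"} with an OCR item "DUPONT" lying nearest the
+# "NOM/SURNAME" label yields
+#   [{"old_text": "DUPONT", "new_text": "NGUYEN", "box": [[x, y], ...]}]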
+
+def _build_form_entries(ocr_items: list[dict]) -> list[dict]:
+ """Build a DocVQA-like form list with question/answer/other and linking.
+ We focus on key pairs: NOM ↔ value; GIVEN NAMES ↔ value.
+ """
+ def matches_any(text: str, keywords: list[str]) -> bool:
+ nt = _sanitize(text)
+ return any(k in nt for k in keywords)
+
+ items = []
+ for it in ocr_items:
+ items.append({
+ "text": it.get("text", ""),
+ "rect": _poly_to_rect(it.get("box") or []),
+ "score": float(it.get("score") or 0.0),
+ "box": it.get("box") or [],
+ })
+
+ questions_idx: dict[str, list[int]] = {"NOM": [], "GIVEN_NAMES": []}
+ for idx, obj in enumerate(items):
+ t = obj["text"]
+ if matches_any(t, ["NOM", "SURNAME"]):
+ questions_idx["NOM"].append(idx)
+ if matches_any(t, ["PRENOM", "GIVEN", "GIVENNAMES"]):
+ questions_idx["GIVEN_NAMES"].append(idx)
+
+ # find nearest answers
+ def pick_answer(candidates: list[int], predicate) -> Optional[int]:
+ best = None
+ bestd = 1e9
+ # candidate list of value boxes
+ value_ids = [i for i in range(len(items)) if predicate(items[i]["text"])]
+ for qi in candidates:
+ qc = _bbox_center(items[qi]["box"]) if items[qi]["box"] else (0.0, 0.0)
+ for vi in value_ids:
+ vc = _bbox_center(items[vi]["box"]) if items[vi]["box"] else (0.0, 0.0)
+ d = _distance(qc, vc)
+ if d < bestd:
+ bestd = d
+ best = (qi, vi)
+ if best is None:
+ return None
+ return best[1]
+
+ def is_surname_candidate(t: str) -> bool:
+ nt = _sanitize(t)
+ return len(nt) >= 3 and nt == nt.upper() and re.fullmatch(r"[A-Z\-\s]+", nt or "") is not None
+
+ def is_given_candidate(t: str) -> bool:
+ return any(c.islower() for c in t) or ("-" in t or "," in t)
+
+ links: list[tuple[int, int]] = []
+ if questions_idx["NOM"]:
+ ans = pick_answer(questions_idx["NOM"], is_surname_candidate)
+ if ans is not None:
+ links.append((questions_idx["NOM"][0], ans))
+ if questions_idx["GIVEN_NAMES"]:
+ ans = pick_answer(questions_idx["GIVEN_NAMES"], is_given_candidate)
+ if ans is not None:
+ links.append((questions_idx["GIVEN_NAMES"][0], ans))
+
+ # Build form list with ids and labels
+ form: list[dict] = []
+ id_map: dict[int, int] = {}
+ next_id = 0
+ question_set = set(q for q, _ in links)
+ answer_set = set(a for _, a in links)
+ for i, obj in enumerate(items):
+ label = "other"
+ if i in question_set:
+ label = "question"
+ elif i in answer_set:
+ label = "answer"
+ entry = {
+ "box": obj["rect"],
+ "text": obj["text"],
+ "label": label,
+ "words": [{"box": obj["rect"], "text": obj["text"]}],
+ "linking": [],
+ "id": next_id,
+ }
+ id_map[i] = next_id
+ next_id += 1
+ form.append(entry)
+
+ # apply linking
+ for q_i, a_i in links:
+ if q_i in id_map and a_i in id_map:
+ form[id_map[q_i]]["linking"].append([id_map[q_i], id_map[a_i]])
+ form[id_map[a_i]]["linking"].append([id_map[q_i], id_map[a_i]])
+
+ return form
+
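+# Each form entry above follows the FUNSD-style annotation schema, e.g.
+# (hypothetical values):
+#   {"box": [x0, y0, x1, y1], "text": "NOM", "label": "question",
+#    "words": [{"box": [...], "text": "NOM"}], "linking": [[0, 3]], "id": 0}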
+
+def _pick_field_values(ocr_result):
+ """Heuristic extraction of key fields from OCR output.
+
+ Returns a dict with potential values for keys: NOM, GIVEN_NAMES.
+ """
+ # Normalize OCR results
+ texts = _extract_text_conf_from_result(ocr_result)
+
+ # Candidates
+ blacklist = {
+ "REPUBLIQUE FRANCAISE",
+ "IDENTITY CARD",
+ "IDENTITY",
+ "FR",
+ "FRA",
+ "PARIS",
+ "SIGNATURE",
+ "DATE DE NAISS",
+ "DATE D EXPIR",
+ "NUM DU DOCUMENT",
+ "NOM D USAGE",
+ "NATIONALITE",
+ "LIEU DE NAISSANCE",
+ "SEXE",
+ "SURNAME",
+ "IDENTITYCARD",
+ }
+
+ cleaned = [(_sanitize(t), t, c) for t, c in texts]
+
+ # Heuristic for surname (NOM): all-uppercase word, not in blacklist
+ surname_candidates = []
+ for norm, original, conf in cleaned:
+ if len(norm) >= 3 and norm == norm.upper() and re.fullmatch(r"[A-Z\-\s]+", norm or ""):
+ if norm not in blacklist and not re.search(r"\b(FR|FRA|PARIS)\b", norm):
+ surname_candidates.append((conf, original))
+ surname = max(surname_candidates, default=(0.0, ""))[1]
+
+ # Heuristic for given names: contains at least one lowercase in original and long enough
+ given_candidates = []
+ for norm, original, conf in cleaned:
+ if any(ch.islower() for ch in original) and len(original) >= 5:
+ if "/" not in original and not original.isdigit():
+ given_candidates.append((conf, original))
+ # Prefer the longest, then highest confidence
+ if given_candidates:
+ given_candidates = sorted(given_candidates, key=lambda x: (len(x[1]), x[0]), reverse=True)
+ given_names = given_candidates[0][1]
+ else:
+ given_names = ""
+
+ return {"NOM": surname.strip(), "GIVEN_NAMES": given_names.strip()}
+
+
+def _load_rules(path: Optional[str]):
+ if path is None or not os.path.exists(path):
+ # Demo defaults
+ return {"NOM": "NGUYEN", "GIVEN_NAMES": "AndrewNguyen"}
+ with open(path, "r", encoding="utf-8") as f:
+ data = json.load(f)
+ # Normalize keys
+ norm = {k.strip().upper(): v for k, v in data.items()}
+ return norm
+
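+# Expected rules JSON for --rules_json (keys are upper-cased on load), e.g.:
+#   {"NOM": "NGUYEN", "GIVEN_NAMES": "AndrewNguyen"}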
+
+def main(opt):
+ cfg_path = 'configs/inference.yaml'
+ model = create_model(cfg_path).cuda()
+ model.load_state_dict(load_state_dict(opt.ckpt_path), strict=False)
+ model.eval()
+
+ # Common pipeline pieces
+ monitor_cfg = {
+ "max_length": 25,
+ "loss_weight": 1.,
+ "attention": 'position',
+ "backbone": 'transformer',
+ "backbone_ln": 3,
+ "checkpoint": "weights/vision_model.pth",
+ "charset_path": "src/module/abinet/data/charset_36.txt"
+ }
+ pipeline = GaMuSA(model, monitor_cfg)
+ output_dir = opt.output_dir
+ os.makedirs(output_dir, exist_ok=True)
+ seed = opt.seed
+ starting_layer = opt.starting_layer
+ guidance_scale = opt.guidance_scale
+ num_sample_per_image = opt.num_sample_per_image
+ num_inference_steps = opt.num_inference_steps
+ seed_everything(seed)
+
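+    # OCR-guided localized editing: detect text, map the replacement rules to
+    # regions near the NOM / GIVEN NAMES labels, then crop -> edit -> paste
+    # each region back into the full image.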
+ if opt.ocr_mode:
+ if opt.input_image is None:
+ raise ValueError("--input_image is required when --ocr_mode is enabled")
+ image_path = opt.input_image
+ image_name = os.path.basename(image_path)
+ w, h = Image.open(image_path).size
+
+ # OCR detection and rule mapping
+ # Build OCR options based on CLI
+ ocr_opts = {"lang": getattr(opt, "ocr_lang", "fr")}
+ if getattr(opt, "ocr_version", ""):
+ ocr_opts["ocr_version"] = opt.ocr_version
+ if getattr(opt, "ocr_det_model", ""):
+ ocr_opts["text_detection_model_name"] = opt.ocr_det_model
+ if getattr(opt, "ocr_rec_model", ""):
+ ocr_opts["text_recognition_model_name"] = opt.ocr_rec_model
+ # Choose detector backend
+ if getattr(opt, 'detector_backend', 'paddle') == 'doctr':
+ # 1) detect with docTR DBNet
+ dbnet_polys = _run_doctr_dbnet_detect(image_path, getattr(opt, 'doctr_det_arch', 'db_resnet50'))
+ # 2) recognize with PaddleOCR on crops
+ # Create Paddle recognizer instance (rec-only if supported)
+ rec_kwargs = {"lang": ocr_opts.get("lang", "fr")}
+ if ocr_opts.get("text_recognition_model_name"):
+ rec_kwargs["text_recognition_model_name"] = ocr_opts["text_recognition_model_name"]
+ rec_ocr = PaddleOCR(**rec_kwargs) if PaddleOCR is not None else None
+ img = Image.open(image_path).convert("RGB")
+ items = []
+ for poly in dbnet_polys:
+ rect = _poly_to_rect(poly)
+ x0, y0, x1, y1 = rect
+ crop = img.crop((x0, y0, x1, y1))
+ text_val = ""
+ score_val = 0.0
+ if rec_ocr is not None:
+ try:
+ try:
+ rec_res = rec_ocr.predict(np.array(crop))
+ except Exception:
+ rec_res = rec_ocr.ocr(np.array(crop))
+ # extract best text
+ flat = _extract_text_conf_from_result(rec_res)
+ if flat:
+ text_val, score_val = flat[0]
+ except Exception:
+ pass
+ items.append({"text": text_val, "score": score_val, "box": poly})
+ ocr_items = items
+ # For anchors, we still need global text; run a lightweight OCR on full image to find labels
+ full_res = _run_paddle_ocr(image_path, ocr_opts)
+ detected_values = _pick_field_values(full_res)
+ # Save raw OCR like before
+ ocr_result = {
+ "det_polygons": dbnet_polys,
+ "items": items,
+ }
+ else:
+ ocr_result = _run_paddle_ocr(image_path, ocr_opts)
+ detected_values = _pick_field_values(ocr_result)
+ # Extract full items with boxes for logging/analysis
+ ocr_items = _extract_items_with_boxes(ocr_result)
+ # Save OCR JSON
+ base = os.path.splitext(image_name)[0]
+ ocr_json_path = os.path.join(output_dir, f"{base}_ocr.json")
+ with open(ocr_json_path, "w", encoding="utf-8") as jf:
+ json.dump({
+ "image": os.path.abspath(image_path),
+ "items": ocr_items,
+ "detected_fields": detected_values,
+ }, jf, ensure_ascii=False, indent=2)
+ # Build DocVQA-like form and save
+ form_entries = _build_form_entries(ocr_items)
+ form_json_path = os.path.join(output_dir, f"{base}_form.json")
+ with open(form_json_path, "w", encoding="utf-8") as fj:
+ json.dump({"form": form_entries}, fj, ensure_ascii=False, indent=2)
+
+ # Also save a quick overlay visualization
+ try:
+ im = Image.open(image_path).convert("RGB")
+ draw = ImageDraw.Draw(im)
+ for it in ocr_items:
+ pts = it.get("box") or []
+ if len(pts) >= 3:
+ poly = [tuple(p) for p in pts]
+ draw.polygon(poly, outline=(255, 0, 0), width=2)
+ im.save(os.path.join(output_dir, f"{base}_ocr.png"))
+ except Exception:
+ pass
+ rules = _load_rules(opt.rules_json)
+
+ # Prepare initial tensors
+ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+ current_tensor = load_image(image_path)
+
+ # Determine localized edit regions for rules
+ tasks = _match_rules_to_regions(ocr_items, rules)
+        # If anchor matching produced no tasks, fall back to simple case-based
+        # heuristics: the first all-caps box stands in for NOM, the first
+        # mixed-case box for GIVEN_NAMES
+        if not tasks:
+            fallback = []
+            for key_u, new_val in (("NOM", rules.get("NOM")), ("GIVEN_NAMES", rules.get("GIVEN_NAMES"))):
+                if not new_val:
+                    continue
+ for it in ocr_items:
+ t = it.get("text", "")
+ if key_u == "NOM" and t and t.isupper():
+ fallback.append({"old_text": t, "new_text": new_val, "box": it.get("box")})
+ break
+ if key_u == "GIVEN_NAMES" and t and (any(c.islower() for c in t) or ("-" in t or "," in t)):
+ fallback.append({"old_text": t, "new_text": new_val, "box": it.get("box")})
+ break
+ if fallback:
+ tasks = fallback
+ # Apply localized edits: crop → edit → paste back
+ base_img = Image.open(image_path).convert("RGB")
+ img_w, img_h = base_img.size
+ composed = base_img.copy()
+ for task in tasks:
+ box = task.get("box") or []
+ if len(box) < 3:
+ continue
+ x0, y0, x1, y1 = _expand_box(box, img_w, img_h, pad_ratio=0.15)
+ crop = base_img.crop((x0, y0, x1, y1))
+ edited_crop = _run_local_edit(
+ pipeline,
+ crop,
+ task["old_text"],
+ task["new_text"],
+ starting_layer,
+ num_inference_steps,
+ guidance_scale,
+ device,
+ )
+ edited_crop = edited_crop.resize((x1 - x0, y1 - y0))
+ composed.paste(edited_crop, (x0, y0))
+
+ # Save result
+ composed.save(os.path.join(output_dir, image_name))
+ return
+
+ # Default dataset-driven flow (original behavior)
+ dataset_dir = opt.dataset_dir
+ style_dir = os.path.join(dataset_dir, 'i_s')
+ style_images_path = {image_name: os.path.join(style_dir, image_name) for image_name in os.listdir(style_dir)}
+ style_txt = os.path.join(dataset_dir, 'i_s.txt')
+ style_dict = {}
+ with open(style_txt, 'r') as f:
+ for line in f.readlines():
+ if line != '\n':
+                image_name, text = line.strip().split(' ', 1)
+ style_dict[image_name] = text
+ target_txt = os.path.join(dataset_dir, 'i_t.txt')
+ target_dict = {}
+ with open(target_txt, 'r') as f:
+ for line in f.readlines():
+ if line != '\n':
+                image_name, text = line.strip().split(' ', 1)
+ target_dict[image_name] = text
+
+    for image_name in tqdm(list(style_images_path.keys())):
+ image_path = style_images_path[image_name]
+ style_text = style_dict[image_name]
+ target_text = target_dict[image_name]
+ w, h = Image.open(image_path).size
+ source_image = load_image(image_path)
+ style_image = load_image(image_path)
+ result = text_editing(
+ pipeline,
+ source_image,
+ style_image,
+ style_text,
+ target_text,
+ starting_layer=starting_layer,
+ ddim_steps=num_inference_steps,
+ scale=guidance_scale,
+ )
+ if opt.benchmark:
+ _, GaMuSA_image = result[:]
+ GaMuSA_image = Image.fromarray((GaMuSA_image * 255).astype(np.uint8)).resize((w, h))
+ GaMuSA_image.save(os.path.join(output_dir, image_name))
+
+ else:
+ save_dir = os.path.join(output_dir, image_name.split('.')[0])
+ os.makedirs(save_dir, exist_ok=True)
+ reconstruction_image, GaMuSA_image = result[:]
+ reconstruction_image = Image.fromarray((reconstruction_image * 255).astype(np.uint8)).resize((w, h))
+ GaMuSA_image = Image.fromarray((GaMuSA_image * 255).astype(np.uint8)).resize((w, h))
+ reconstruction_image.save(os.path.join(save_dir, 'recons_' + style_text + '.png'))
+ GaMuSA_image.save(os.path.join(save_dir, 'GaMUSA_' + target_text + '.png'))
+
+
+if __name__ == "__main__":
+ parser = create_parser()
+ opt = parser.parse_args()
+ main(opt)
diff --git a/src/model/TextGen/modify/Synthtext/gen.py b/src/model/TextGen/modify/Synthtext/gen.py
new file mode 100644
index 0000000..f6b26ae
--- /dev/null
+++ b/src/model/TextGen/modify/Synthtext/gen.py
@@ -0,0 +1,284 @@
+# -*- coding: utf-8 -*-
+"""
+SRNet data generator.
+Copyright (c) 2019 Netease Youdao Information Technology Co.,Ltd.
+Licensed under the GPL License
+Written by Yu Qian
+"""
+
+import os
+import cv2
+import math
+import numpy as np
+import pygame
+from pygame import freetype
+import random
+import multiprocessing
+import queue
+import Augmentor
+
+from . import render_text_mask
+from . import colorize
+from . import skeletonization
+from . import render_standard_text
+from . import data_cfg
+import pickle as cp
+
+class datagen():
+
+ def __init__(self):
+ freetype.init()
+ # cur_file_path = os.path.dirname(__file__)
+ font_dir = data_cfg.font_dir
+ self.font_list = os.listdir(font_dir)
+ self.font_list = [os.path.join(font_dir, font_name) for font_name in self.font_list]
+ self.standard_font_path = data_cfg.standard_font_path
+
+ color_filepath = data_cfg.color_filepath
+ self.colorsRGB, self.colorsLAB = colorize.get_color_matrix(color_filepath)
+
+ text_filepath = data_cfg.text_filepath
+ self.text_list = open(text_filepath, 'r').readlines()
+ self.text_list = [text.strip() for text in self.text_list]
+
+ # bg_filepath = os.path.join(cur_file_path, data_cfg.bg_filepath)
+ self.bg_list = open(data_cfg.bg_filepath, 'r').readlines()
+ self.bg_list = [img_path.strip() for img_path in self.bg_list]
+
+ self.surf_augmentor = Augmentor.DataPipeline(None)
+ self.surf_augmentor.random_distortion(probability = data_cfg.elastic_rate,
+ grid_width = data_cfg.elastic_grid_size, grid_height = data_cfg.elastic_grid_size,
+ magnitude = data_cfg.elastic_magnitude)
+
+ self.bg_augmentor = Augmentor.DataPipeline(None)
+ self.bg_augmentor.random_brightness(probability = data_cfg.brightness_rate,
+ min_factor = data_cfg.brightness_min, max_factor = data_cfg.brightness_max)
+ self.bg_augmentor.random_color(probability = data_cfg.color_rate,
+ min_factor = data_cfg.color_min, max_factor = data_cfg.color_max)
+ self.bg_augmentor.random_contrast(probability = data_cfg.contrast_rate,
+ min_factor = data_cfg.contrast_min, max_factor = data_cfg.contrast_max)
+ def gen_srnet_data_with_background(self):
+
+ while True:
+ # choose font, text and bg
+ font = np.random.choice(self.font_list)
+ font_name = font
+ text1, text2 = np.random.choice(self.text_list), np.random.choice(self.text_list)
+
+ upper_rand = np.random.rand()
+ if upper_rand < data_cfg.capitalize_rate + data_cfg.uppercase_rate:
+ text1, text2 = text1.capitalize(), text2.capitalize()
+ if upper_rand < data_cfg.uppercase_rate:
+ text1, text2 = text1.upper(), text2.upper()
+
+ bg = cv2.imread(random.choice(self.bg_list))
+ # init font
+ font = freetype.Font(font)
+ font.antialiased = True
+ font.origin = True
+
+ # choose font style
+ font.size = np.random.randint(data_cfg.font_size[0], data_cfg.font_size[1] + 1)
+ font.underline = np.random.rand() < data_cfg.underline_rate
+ font.strong = np.random.rand() < data_cfg.strong_rate
+ font.oblique = np.random.rand() < data_cfg.oblique_rate
+
+ # render text to surf
+ param = {
+ 'is_curve': np.random.rand() < data_cfg.is_curve_rate,
+ 'curve_rate': data_cfg.curve_rate_param[0] * np.random.randn()
+ + data_cfg.curve_rate_param[1],
+ 'curve_center': np.random.randint(0, len(text1))
+ }
+ surf1, bbs1 = render_text_mask.render_text(font, text1, param)
+ param['curve_center'] = int(param['curve_center'] / len(text1) * len(text2))
+ surf2, bbs2 = render_text_mask.render_text(font, text2, param)
+
+ # get padding
+ padding_ud = np.random.randint(data_cfg.padding_ud[0], data_cfg.padding_ud[1] + 1, 2)
+ padding_lr = np.random.randint(data_cfg.padding_lr[0], data_cfg.padding_lr[1] + 1, 2)
+ padding = np.hstack((padding_ud, padding_lr))
+
+ # perspect the surf
+ rotate = data_cfg.rotate_param[0] * np.random.randn() + data_cfg.rotate_param[1]
+ zoom = data_cfg.zoom_param[0] * np.random.randn(2) + data_cfg.zoom_param[1]
+ shear = data_cfg.shear_param[0] * np.random.randn(2) + data_cfg.shear_param[1]
+            perspect = data_cfg.perspect_param[0] * np.random.randn(2) + data_cfg.perspect_param[1]
+
+ surf1 = render_text_mask.perspective(surf1, rotate, zoom, shear, perspect, padding) # w first
+ surf2 = render_text_mask.perspective(surf2, rotate, zoom, shear, perspect, padding) # w first
+
+ # choose a background
+ surf1_h, surf1_w = surf1.shape[:2]
+ surf2_h, surf2_w = surf2.shape[:2]
+ surf_h = max(surf1_h, surf2_h)
+ surf_w = max(surf1_w, surf2_w)
+ surf1 = render_text_mask.center2size(surf1, (surf_h, surf_w))
+ surf2 = render_text_mask.center2size(surf2, (surf_h, surf_w))
+
+ bg_h, bg_w = bg.shape[:2]
+ if bg_w < surf_w or bg_h < surf_h:
+ continue
+ x = np.random.randint(0, bg_w - surf_w + 1)
+ y = np.random.randint(0, bg_h - surf_h + 1)
+ t_b = bg[y:y+surf_h, x:x+surf_w, :]
+
+ # augment surf
+ surfs = [[surf1, surf2]]
+ self.surf_augmentor.augmentor_images = surfs
+ surf1, surf2 = self.surf_augmentor.sample(1)[0]
+
+ # bg augment
+ bgs = [[t_b]]
+ self.bg_augmentor.augmentor_images = bgs
+ t_b = self.bg_augmentor.sample(1)[0][0]
+
+ # render standard text
+ i_t = render_standard_text.make_standard_text(self.standard_font_path, text2, (surf_h, surf_w))
+
+ # get min h of bbs
+ min_h1 = np.min(bbs1[:, 3])
+ min_h2 = np.min(bbs2[:, 3])
+ min_h = min(min_h1, min_h2)
+ # get font color
+ if np.random.rand() < data_cfg.use_random_color_rate:
+ fg_col, bg_col = (np.random.rand(3) * 255.).astype(np.uint8), (np.random.rand(3) * 255.).astype(np.uint8)
+ else:
+ fg_col, bg_col = colorize.get_font_color(self.colorsRGB, self.colorsLAB, t_b)
+
+            # colorize the surf and combine foreground and background
+ param = {
+ 'is_border': np.random.rand() < data_cfg.is_border_rate,
+ 'bordar_color': tuple(np.random.randint(0, 256, 3)),
+ 'is_shadow': np.random.rand() < data_cfg.is_shadow_rate,
+ 'shadow_angle': np.pi / 4 * np.random.choice(data_cfg.shadow_angle_degree)
+ + data_cfg.shadow_angle_param[0] * np.random.randn(),
+ 'shadow_shift': data_cfg.shadow_shift_param[0, :] * np.random.randn(3)
+ + data_cfg.shadow_shift_param[1, :],
+ 'shadow_opacity': data_cfg.shadow_opacity_param[0] * np.random.randn()
+ + data_cfg.shadow_opacity_param[1]
+ }
+ s_s, i_s = colorize.colorize(surf1, t_b, fg_col, bg_col, self.colorsRGB, self.colorsLAB, min_h, param)
+ t_t, t_f = colorize.colorize(surf2, t_b, fg_col, bg_col, self.colorsRGB, self.colorsLAB, min_h, param)
+ ha, wa = i_s.shape[:2]
+ if ha * wa < 1000:
+ continue
+ # skeletonization
+ t_sk = skeletonization.skeletonization(surf2, 127)
+ break
+
+ return {"data": [i_t, i_s, t_sk, t_t, t_b, t_f, s_s, surf2, surf1],
+ "is_text": text1,
+ "it_text": text2,
+ "font": font_name
+ }
+
+def enqueue_data(queue, capacity):
+ np.random.seed()
+ gen = datagen()
+ while True:
+ try:
+ data_dict = gen.gen_srnet_data_with_background()
+        except Exception:
+            # generation occasionally fails on degenerate samples; skip and retry
+            continue
+ if queue.qsize() < capacity:
+ queue.put(data_dict)
+
+class multiprocess_datagen():
+
+ def __init__(self, process_num, data_capacity):
+
+ self.process_num = process_num
+ self.data_capacity = data_capacity
+
+ def multiprocess_runningqueue(self):
+
+ manager = multiprocessing.Manager()
+ self.queue = manager.Queue()
+ self.pool = multiprocessing.Pool(processes = self.process_num)
+ self.processes = []
+ for _ in range(self.process_num):
+ p = self.pool.apply_async(enqueue_data, args=(self.queue, self.data_capacity))
+ self.processes.append(p)
+ self.pool.close()
+
+ def dequeue_data(self):
+ np.random.seed()
+ while self.queue.empty():
+ pass
+ data_dict = self.queue.get()
+ return data_dict
+
+
+ def dequeue_batch(self, batch_size, data_shape):
+
+ while self.queue.qsize() < batch_size:
+ pass
+
+ i_t_batch, i_s_batch = [], []
+ t_sk_batch, t_t_batch, t_b_batch, t_f_batch = [], [], [], []
+ mask_t_batch = []
+
+ for i in range(batch_size):
+            # dequeue_data() returns a dict; the tensors live under "data" as
+            # [i_t, i_s, t_sk, t_t, t_b, t_f, s_s, mask_t, mask_s]
+            i_t, i_s, t_sk, t_t, t_b, t_f, _s_s, mask_t, _mask_s = self.dequeue_data()["data"]
+ i_t_batch.append(i_t)
+ i_s_batch.append(i_s)
+ t_sk_batch.append(t_sk)
+ t_t_batch.append(t_t)
+ t_b_batch.append(t_b)
+ t_f_batch.append(t_f)
+ mask_t_batch.append(mask_t)
+
+ w_sum = 0
+ for t_b in t_b_batch:
+ h, w = t_b.shape[:2]
+ scale_ratio = data_shape[0] / h
+ w_sum += int(w * scale_ratio)
+
+ to_h = data_shape[0]
+ to_w = w_sum // batch_size
+ to_w = int(round(to_w / 8)) * 8
+ to_size = (to_w, to_h) # w first for cv2
+ for i in range(batch_size):
+ i_t_batch[i] = cv2.resize(i_t_batch[i], to_size)
+ i_s_batch[i] = cv2.resize(i_s_batch[i], to_size)
+ t_sk_batch[i] = cv2.resize(t_sk_batch[i], to_size, interpolation=cv2.INTER_NEAREST)
+ t_t_batch[i] = cv2.resize(t_t_batch[i], to_size)
+ t_b_batch[i] = cv2.resize(t_b_batch[i], to_size)
+ t_f_batch[i] = cv2.resize(t_f_batch[i], to_size)
+ mask_t_batch[i] = cv2.resize(mask_t_batch[i], to_size, interpolation=cv2.INTER_NEAREST)
+ # eliminate the effect of resize on t_sk
+ t_sk_batch[i] = skeletonization.skeletonization(mask_t_batch[i], 127)
+
+ i_t_batch = np.stack(i_t_batch)
+ i_s_batch = np.stack(i_s_batch)
+ t_sk_batch = np.expand_dims(np.stack(t_sk_batch), axis = -1)
+ t_t_batch = np.stack(t_t_batch)
+ t_b_batch = np.stack(t_b_batch)
+ t_f_batch = np.stack(t_f_batch)
+ mask_t_batch = np.expand_dims(np.stack(mask_t_batch), axis = -1)
+
+ i_t_batch = i_t_batch.astype(np.float32) / 127.5 - 1.
+ i_s_batch = i_s_batch.astype(np.float32) / 127.5 - 1.
+ t_sk_batch = t_sk_batch.astype(np.float32) / 255.
+ t_t_batch = t_t_batch.astype(np.float32) / 127.5 - 1.
+ t_b_batch = t_b_batch.astype(np.float32) / 127.5 - 1.
+ t_f_batch = t_f_batch.astype(np.float32) / 127.5 - 1.
+ mask_t_batch = mask_t_batch.astype(np.float32) / 255.
+
+ return [i_t_batch, i_s_batch, t_sk_batch, t_t_batch, t_b_batch, t_f_batch, mask_t_batch]
+
+ def get_queue_size(self):
+
+ return self.queue.qsize()
+
+ def terminate_pool(self):
+
+ self.pool.terminate()
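+# Minimal usage sketch (assumes the SRNet-Datagen assets referenced in data_cfg
+# exist):
+#   mp_gen = multiprocess_datagen(process_num=8, data_capacity=64)
+#   mp_gen.multiprocess_runningqueue()
+#   sample = mp_gen.dequeue_data()  # {"data": [...], "is_text", "it_text", "font"}
+#   mp_gen.terminate_pool()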
diff --git a/src/model/TextGen/modify/cfg.py b/src/model/TextGen/modify/cfg.py
new file mode 100644
index 0000000..dcb0319
--- /dev/null
+++ b/src/model/TextGen/modify/cfg.py
@@ -0,0 +1,27 @@
+"""
+Configurations of data generating.
+Copyright (c) 2019 Netease Youdao Information Technology Co.,Ltd.
+Licensed under the GPL License (see LICENSE for details)
+Written by Yu Qian
+"""
+
+# dir
+data_dir = '.../train-50k-1/'
+i_t_dir = 'i_t'
+i_s_dir = 'i_s'
+t_sk_dir = 't_sk'
+t_t_dir = 't_t'
+t_b_dir = 't_b'
+t_f_dir = 't_f'
+s_s_dir = 's_s'
+mask_t_dir = 'mask_t'
+mask_s_dir = 'mask_s'
+
+
+# sample
+sample_num = 50000
+
+# multiprocess
+process_num = 16
+data_capacity = 64
+
diff --git a/src/model/TextGen/modify/datagen.py b/src/model/TextGen/modify/datagen.py
new file mode 100644
index 0000000..5141824
--- /dev/null
+++ b/src/model/TextGen/modify/datagen.py
@@ -0,0 +1,80 @@
+"""
+Generating data for SRNet
+Copyright (c) 2019 Netease Youdao Information Technology Co.,Ltd.
+Licensed under the GPL License (see LICENSE for details)
+Written by Yu Qian
+"""
+
+import os
+import cv2
+import cfg
+from tqdm import tqdm
+from Synthtext.gen import datagen, multiprocess_datagen
+
+def makedirs(path):
+ if not os.path.exists(path):
+ os.makedirs(path)
+
+def main():
+
+ i_t_dir = os.path.join(cfg.data_dir, cfg.i_t_dir)
+ i_s_dir = os.path.join(cfg.data_dir, cfg.i_s_dir)
+ t_sk_dir = os.path.join(cfg.data_dir, cfg.t_sk_dir)
+ t_t_dir = os.path.join(cfg.data_dir, cfg.t_t_dir)
+ t_b_dir = os.path.join(cfg.data_dir, cfg.t_b_dir)
+ t_f_dir = os.path.join(cfg.data_dir, cfg.t_f_dir)
+ s_s_dir = os.path.join(cfg.data_dir, cfg.s_s_dir)
+ mask_t_dir = os.path.join(cfg.data_dir, cfg.mask_t_dir)
+ mask_s_dir = os.path.join(cfg.data_dir, cfg.mask_s_dir)
+
+ makedirs(i_t_dir)
+ makedirs(i_s_dir)
+ makedirs(t_sk_dir)
+ makedirs(t_t_dir)
+ makedirs(t_b_dir)
+ makedirs(t_f_dir)
+ makedirs(s_s_dir)
+ makedirs(mask_t_dir)
+ makedirs(mask_s_dir)
+
+ it_txt = open(os.path.join(cfg.data_dir, 'i_t.txt'), 'w')
+ is_txt = open(os.path.join(cfg.data_dir, 'i_s.txt'), 'w')
+ font_txt = open(os.path.join(cfg.data_dir, 'font.txt'), 'w')
+
+ mp_gen = multiprocess_datagen(cfg.process_num, cfg.data_capacity)
+ mp_gen.multiprocess_runningqueue()
+ digit_num = len(str(cfg.sample_num))
+ for idx in tqdm(range(cfg.sample_num)):
+ data_dict = mp_gen.dequeue_data()
+        i_t, i_s, t_sk, t_t, t_b, t_f, s_s, mask_t, mask_s = data_dict["data"]
+ is_text = data_dict["is_text"]
+ it_text = data_dict["it_text"]
+ font_name = data_dict["font"]
+ is_txt.write(str(idx).zfill(digit_num) + '.png' + ' ' + is_text + '\n')
+ it_txt.write(str(idx).zfill(digit_num) + '.png' + ' ' + it_text + '\n')
+ font_txt.write(str(idx).zfill(digit_num) + '.png' + ' ' + font_name + '\n')
+ i_t_path = os.path.join(i_t_dir, str(idx).zfill(digit_num) + '.png')
+ i_s_path = os.path.join(i_s_dir, str(idx).zfill(digit_num) + '.png')
+ t_sk_path = os.path.join(t_sk_dir, str(idx).zfill(digit_num) + '.png')
+ t_t_path = os.path.join(t_t_dir, str(idx).zfill(digit_num) + '.png')
+ t_b_path = os.path.join(t_b_dir, str(idx).zfill(digit_num) + '.png')
+ t_f_path = os.path.join(t_f_dir, str(idx).zfill(digit_num) + '.png')
+ s_s_path = os.path.join(s_s_dir, str(idx).zfill(digit_num) + '.png')
+ mask_t_path = os.path.join(cfg.data_dir, cfg.mask_t_dir, str(idx).zfill(digit_num) + '.png')
+ mask_s_path = os.path.join(cfg.data_dir, cfg.mask_s_dir, str(idx).zfill(digit_num) + '.png')
+ cv2.imwrite(i_t_path, i_t, [int(cv2.IMWRITE_PNG_COMPRESSION), 0])
+ cv2.imwrite(i_s_path, i_s, [int(cv2.IMWRITE_PNG_COMPRESSION), 0])
+ cv2.imwrite(t_sk_path, t_sk, [int(cv2.IMWRITE_PNG_COMPRESSION), 0])
+ cv2.imwrite(t_t_path, t_t, [int(cv2.IMWRITE_PNG_COMPRESSION), 0])
+ cv2.imwrite(t_b_path, t_b, [int(cv2.IMWRITE_PNG_COMPRESSION), 0])
+ cv2.imwrite(t_f_path, t_f, [int(cv2.IMWRITE_PNG_COMPRESSION), 0])
+ cv2.imwrite(s_s_path, s_s, [int(cv2.IMWRITE_PNG_COMPRESSION), 0])
+ cv2.imwrite(mask_t_path, mask_t, [int(cv2.IMWRITE_PNG_COMPRESSION), 0])
+ cv2.imwrite(mask_s_path, mask_s, [int(cv2.IMWRITE_PNG_COMPRESSION), 0])
+
+ mp_gen.terminate_pool()
+ is_txt.close()
+ it_txt.close()
+    font_txt.close()
+
+if __name__ == '__main__':
+ main()
diff --git a/src/model/TextGen/preglyph/.idea/.gitignore b/src/model/TextGen/preglyph/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/src/model/TextGen/preglyph/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/src/model/TextGen/preglyph/.idea/deployment.xml b/src/model/TextGen/preglyph/.idea/deployment.xml
new file mode 100644
index 0000000..fa2baaa
--- /dev/null
+++ b/src/model/TextGen/preglyph/.idea/deployment.xml
@@ -0,0 +1,56 @@
diff --git a/src/model/TextGen/preglyph/.idea/inspectionProfiles/Project_Default.xml b/src/model/TextGen/preglyph/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..08ed659
--- /dev/null
+++ b/src/model/TextGen/preglyph/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,102 @@
diff --git a/src/model/TextGen/preglyph/.idea/inspectionProfiles/profiles_settings.xml b/src/model/TextGen/preglyph/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/src/model/TextGen/preglyph/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
diff --git a/src/model/TextGen/preglyph/.idea/misc.xml b/src/model/TextGen/preglyph/.idea/misc.xml
new file mode 100644
index 0000000..a971a2c
--- /dev/null
+++ b/src/model/TextGen/preglyph/.idea/misc.xml
@@ -0,0 +1,4 @@
diff --git a/src/model/TextGen/preglyph/.idea/modules.xml b/src/model/TextGen/preglyph/.idea/modules.xml
new file mode 100644
index 0000000..ef10476
--- /dev/null
+++ b/src/model/TextGen/preglyph/.idea/modules.xml
@@ -0,0 +1,8 @@
diff --git a/src/model/TextGen/preglyph/.idea/preglyph.iml b/src/model/TextGen/preglyph/.idea/preglyph.iml
new file mode 100644
index 0000000..d0876a7
--- /dev/null
+++ b/src/model/TextGen/preglyph/.idea/preglyph.iml
@@ -0,0 +1,8 @@
diff --git a/src/model/TextGen/preglyph/configs/GlyphTrain.yaml b/src/model/TextGen/preglyph/configs/GlyphTrain.yaml
new file mode 100644
index 0000000..6b2ff3d
--- /dev/null
+++ b/src/model/TextGen/preglyph/configs/GlyphTrain.yaml
@@ -0,0 +1,43 @@
+dataset:
+ target: dataloader.LabelDataset
+ params:
+ size: 224
+ length: 100000
+ font_path: ".../Syn_data/fonts/"
+ min_len: 1
+ max_len: 24
+
+model:
+ target: modules.LabelEncoder
+ params:
+ trainable: True
+ max_len: 24
+ emb_dim: 768
+ n_heads: 8
+ n_trans_layers: 12
+ lr: 1e-5
+
+ visual_config:
+ target: modules.ViTSTREncoder
+ params:
+ freeze: True
+ ckpt_path: ".../weights/vitstr_base_patch16_224.pth"
+ size: 224
+ patch_size: 16
+ embed_dim: 768
+ depth: 12
+ num_heads: 12
+ mlp_ratio: 4
+ qkv_bias: True
+ in_chans: 1
+
+
+num_workers: 8
+batch_size: 256
+check_freq: 10
+
+lightning:
+ max_epochs: 100
+ accelerator: "cuda"
+ devices: [0, 1, 2, 3]
+ default_root_dir: "./logs/pre_glyph_logs"
diff --git a/src/model/TextGen/preglyph/dataloader.py b/src/model/TextGen/preglyph/dataloader.py
new file mode 100644
index 0000000..2d140e6
--- /dev/null
+++ b/src/model/TextGen/preglyph/dataloader.py
@@ -0,0 +1,90 @@
+import os
+import random
+import string
+import torchvision.transforms as transforms
+import torch.utils.data as data
+from omegaconf import OmegaConf
+from PIL import Image, ImageDraw, ImageFont
+from random import choice, randint
+from utils import *
+
+class LabelDataset(data.Dataset):
+
+ def __init__(self, size, length, font_path, min_len, max_len) -> None:
+ super().__init__()
+
+ # constraint
+ self.length = length
+ self.size = size
+ self.font_dir = font_path
+
+ self.character = string.printable[:-6]
+ self.min_len = min_len
+ self.max_len = max_len
+
+ self.grayscale = transforms.Grayscale()
+ self.resize = transforms.Resize((self.size, self.size), transforms.InterpolationMode.BICUBIC, antialias=True)
+
+ self.words = []
+ with open("words.txt", 'r') as f:
+ lines = f.readlines()
+ for line in lines:
+ texts = line.strip().split(' ')
+ self.words += texts
+
+ def __len__(self):
+
+ return self.length
+
+ def __getitem__(self, index):
+
+ while True:
+
+ if random.random() < 0.5:
+ text_len = randint(self.min_len, self.max_len)
+ text = "".join([choice(self.character) for i in range(text_len)])
+ else:
+ text = random.choice(self.words)
+
+ try:
+ font = ImageFont.truetype(os.path.join(self.font_dir, choice(os.listdir(self.font_dir))), 128)
+
+ std_l, std_t, std_r, std_b = font.getbbox(text)
+ std_h, std_w = std_b - std_t, std_r - std_l
+ if std_h == 0 or std_w == 0:
+ continue
+            except Exception:  # unreadable font or degenerate bbox; resample
+ continue
+
+ try:
+ image = Image.new('RGB', (std_w, std_h), color = (0,0,0))
+ draw = ImageDraw.Draw(image)
+ draw.text((0, 0), text, fill = (255,255,255), font=font, anchor="lt")
+            except Exception:  # text rendering failed; resample
+ continue
+
+
+ image = transforms.ToTensor()(image)
+ image = self.grayscale(image)
+ image = self.resize(image)
+
+ batch = {
+ "image": image,
+ "text": text
+ }
+
+ return batch
+
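+# Each LabelDataset sample is {"image": FloatTensor[1, size, size], "text": str};
+# the text is either a random printable string or a word drawn from words.txt.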
+
+
+
+
+def get_dataloader(cfgs, datype="train"):
+
+ dataset_cfgs = OmegaConf.load(cfgs.dataset_cfg_path)
+ print(f"Extracting data from {dataset_cfgs.target}")
+    Dataset = eval(dataset_cfgs.target)
+    # LabelDataset takes keyword params only; it has no datype argument
+    dataset = Dataset(**dataset_cfgs.params)
+
+ return data.DataLoader(dataset=dataset, batch_size=cfgs.batch_size, shuffle=cfgs.shuffle, num_workers=cfgs.num_workers, drop_last=True)
+
diff --git a/src/model/TextGen/preglyph/modules.py b/src/model/TextGen/preglyph/modules.py
new file mode 100644
index 0000000..a8f0042
--- /dev/null
+++ b/src/model/TextGen/preglyph/modules.py
@@ -0,0 +1,273 @@
+from typing import Dict, List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import math
+import string
+from utils import instantiate_from_config
+from torchvision import transforms
+from timm.models.vision_transformer import VisionTransformer
+import numpy as np
+import pytorch_lightning as pl
+
+def autocast(f, enabled=True):
+ def do_autocast(*args, **kwargs):
+ with torch.cuda.amp.autocast(
+ enabled=enabled,
+ dtype=torch.get_autocast_gpu_dtype(),
+ cache_enabled=torch.is_autocast_cache_enabled(),
+ ):
+ return f(*args, **kwargs)
+
+    return do_autocast
+
+
+class AbstractEmbModel(nn.Module):
+ def __init__(self):
+ super().__init__()
+ self._is_trainable = None
+ self._ucg_rate = None
+ self._input_key = None
+ self._emb_key = None
+
+ @property
+ def is_trainable(self) -> bool:
+ return self._is_trainable
+
+ @property
+ def ucg_rate(self) -> Union[float, torch.Tensor]:
+ return self._ucg_rate
+
+ @property
+ def input_key(self) -> str:
+ return self._input_key
+
+ @property
+ def emb_key(self) -> str:
+ return self._emb_key
+
+ @is_trainable.setter
+ def is_trainable(self, value: bool):
+ self._is_trainable = value
+
+ @ucg_rate.setter
+ def ucg_rate(self, value: Union[float, torch.Tensor]):
+ self._ucg_rate = value
+
+ @input_key.setter
+ def input_key(self, value: str):
+ self._input_key = value
+
+ @emb_key.setter
+ def emb_key(self, value: str):
+ self._emb_key = value
+
+ @is_trainable.deleter
+ def is_trainable(self):
+ del self._is_trainable
+
+ @ucg_rate.deleter
+ def ucg_rate(self):
+ del self._ucg_rate
+
+ @input_key.deleter
+ def input_key(self):
+ del self._input_key
+
+ @emb_key.deleter
+ def emb_key(self):
+ del self._emb_key
+
+
+
+
+
+class PositionalEncoding(nn.Module):
+
+ def __init__(self, d_model, dropout=0.1, max_len=5000):
+ super(PositionalEncoding, self).__init__()
+
+ self.dropout = nn.Dropout(p=dropout)
+
+ pe = torch.zeros(max_len, d_model)
+ position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+ div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+ pe[:, 0::2] = torch.sin(position * div_term)
+ pe[:, 1::2] = torch.cos(position * div_term)
+ self.register_buffer('pe', pe)
+
+ def forward(self, x):
+ x = x + torch.tile(self.pe[None, ...].to(x.device), (x.shape[0], 1, 1))
+ return self.dropout(x)
+
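+# The table above is the standard sinusoidal encoding:
+#   PE[pos, 2i] = sin(pos / 10000^(2i/d)), PE[pos, 2i+1] = cos(pos / 10000^(2i/d));
+# forward() tiles it across the batch before applying dropout.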
+
+class ViTSTREncoder(VisionTransformer):
+ '''
+ ViTSTREncoder is basically a ViT that uses ViTSTR weights
+ '''
+
+ def __init__(self, size=224, ckpt_path=None, freeze=True, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ self.grayscale = transforms.Grayscale()
+ self.resize = transforms.Resize((size, size), transforms.InterpolationMode.BICUBIC, antialias=True)
+
+ self.character = string.printable[:-6]
+ self.reset_classifier(num_classes=len(self.character) + 2)
+
+ if ckpt_path is not None:
+ self.load_state_dict(torch.load(ckpt_path, map_location="cpu"), strict=False)
+
+ if freeze:
+ self.freeze()
+
+ def reset_classifier(self, num_classes):
+ self.num_classes = num_classes
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ def freeze(self):
+ for param in self.parameters():
+ param.requires_grad_(False)
+
+ def forward_features(self, x):
+ B = x.shape[0]
+ x = self.patch_embed(x)
+
+ cls_tokens = self.cls_token.expand(B, -1, -1)
+ x = torch.cat((cls_tokens, x), dim=1)
+ x = x + self.pos_embed
+ x = self.pos_drop(x)
+
+ for blk in self.blocks:
+ x = blk(x)
+
+ x = self.norm(x)
+ return x
+
+ def forward(self, x):
+
+ x = self.forward_features(x)
+
+ return x
+
+ def encode(self, x):
+ return self(x)
+
+
+class LabelEncoder(AbstractEmbModel, pl.LightningModule):
+
+ def __init__(self, max_len, emb_dim, n_heads=8, n_trans_layers=12, ckpt_path=None, trainable=False,
+ lr=1e-4, clip_dim=1024, visual_len=197, visual_dim=768,
+ visual_config=None, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ self.max_len = max_len
+ self.emd_dim = emb_dim
+ self.n_heads = n_heads
+ self.n_trans_layers = n_trans_layers
+ self.character = string.printable[:-6]
+ self.num_cls = len(self.character) + 1
+
+ self.label_embedding = nn.Embedding(self.num_cls, self.emd_dim)
+ self.pos_embedding = PositionalEncoding(d_model=self.emd_dim, max_len=self.max_len)
+ transformer_block = nn.TransformerEncoderLayer(d_model=self.emd_dim, nhead=self.n_heads, batch_first=True)
+ self.encoder = nn.TransformerEncoder(transformer_block, num_layers=self.n_trans_layers)
+
+ if ckpt_path is not None:
+ self.load_state_dict(torch.load(ckpt_path, map_location="cpu"), strict=False)
+
+ if trainable:
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+ self.visual_encoder = instantiate_from_config(visual_config)
+
+ self.learning_rate = lr
+ self.clip_dim = clip_dim
+ self.visual_len = visual_len
+ self.visual_dim = visual_dim
+
+ self.text_head = nn.Sequential(*[
+ nn.InstanceNorm1d(self.max_len),
+ nn.Linear(self.emd_dim, self.clip_dim, bias=False),
+ nn.Conv1d(in_channels=self.max_len, out_channels=1, kernel_size=1)
+ ])
+
+ self.visual_head = nn.Sequential(*[
+ nn.InstanceNorm1d(self.visual_len),
+ nn.Linear(self.visual_dim, self.clip_dim, bias=False),
+ nn.Conv1d(in_channels=self.visual_len, out_channels=1, kernel_size=1)
+ ])
+
+
+ def freeze(self):
+ for param in self.parameters():
+ param.requires_grad = False
+
+ def get_index(self, labels):
+
+ indexes = []
+ for label in labels:
+ if len(label) > self.max_len:
+ label = label[:self.max_len]
+ index = [self.character.find(c) + 1 for c in label]
+ index = index + [0] * (self.max_len - len(index))
+ indexes.append(index)
+
+ return torch.tensor(indexes, device=next(self.parameters()).device)
+
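+    # get_index maps characters to 1..num_cls-1 via string.printable; index 0 is
+    # shared by padding and any character outside the charset (find returns -1).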
+ def get_embeddings(self, x):
+
+ emb = self.label_embedding(x)
+ emb = self.pos_embedding(emb)
+ out = self.encoder(emb)
+
+ return out
+
+ def forward(self, labels):
+
+ idx = self.get_index(labels)
+ out = self.get_embeddings(idx)
+
+ return out
+
+ def encode(self, text):
+ return self(text)
+
+ def get_loss(self, text_out, visual_out, clip_target):
+
+ text_out = text_out / text_out.norm(dim=1, keepdim=True) # b, 1024
+ visual_out = visual_out / visual_out.norm(dim=1, keepdim=True) # b, 1024
+
+ logit_scale = self.logit_scale.exp()
+ logits_per_image = logit_scale * visual_out @ text_out.T # b, b
+ logits_per_text = logits_per_image.T # b, b
+ clip_loss_image = nn.functional.cross_entropy(logits_per_image, clip_target)
+ clip_loss_text = nn.functional.cross_entropy(logits_per_text, clip_target)
+ clip_loss = (clip_loss_image + clip_loss_text) / 2
+ return clip_loss, logits_per_text
+
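+    # Symmetric CLIP-style InfoNCE: each text embedding should match the visual
+    # embedding at the same batch index, hence clip_target = arange(batch_size).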
+ def training_step(self, batch, batch_idx):
+
+ text = batch["text"]
+ image = batch["image"]
+ idx = self.get_index(text)
+ text_emb = self.get_embeddings(idx) # b, l, d
+ visual_emb = self.visual_encoder(image) # b, n, d
+ text_out = self.text_head(text_emb).squeeze(1) # b, 1024
+ visual_out = self.visual_head(visual_emb).squeeze(1) # b, 1024
+
+ cls_target = idx # b, c
+ clip_target = torch.arange(0, idx.shape[0], 1).to(cls_target) # b,
+
+ clip_loss, logits_per_text = self.get_loss(text_out, visual_out, clip_target)
+ loss_dict = {}
+ loss_dict["loss/clip_loss"] = clip_loss
+ clip_idx = torch.max(logits_per_text, dim=-1).indices # b,
+ clip_acc = (clip_idx == clip_target).to(dtype=torch.float32).mean()
+ loss_dict["acc/clip_acc"] = clip_acc
+ self.log_dict(loss_dict, prog_bar=True, batch_size=len(text),
+ logger=True, on_step=True, on_epoch=True, sync_dist=True)
+ return clip_loss
+
+ def configure_optimizers(self):
+
+ lr = self.learning_rate
+ opt = torch.optim.AdamW(filter(lambda p: p.requires_grad, self.parameters()), lr=lr)
+
+ return opt
diff --git a/src/model/TextGen/preglyph/pretrain.py b/src/model/TextGen/preglyph/pretrain.py
new file mode 100644
index 0000000..669f545
--- /dev/null
+++ b/src/model/TextGen/preglyph/pretrain.py
@@ -0,0 +1,38 @@
+import torch
+import torch.utils.data as data
+import pytorch_lightning as pl
+from omegaconf import OmegaConf
+from utils import instantiate_from_config
+from pytorch_lightning.callbacks import ModelCheckpoint
+
+torch.backends.cudnn.enabled = False
+
+def get_dataloader(cfgs):
+
+ dataset = instantiate_from_config(cfgs.dataset)
+ dataloader = data.DataLoader(dataset=dataset, batch_size=cfgs.batch_size, shuffle=False, num_workers=cfgs.num_workers)
+
+ return dataloader
+
+def get_model(cfgs):
+
+ model = instantiate_from_config(cfgs.model)
+ if "load_ckpt_path" in cfgs:
+ model.load_state_dict(torch.load(cfgs.load_ckpt_path, map_location="cpu")["state_dict"], strict=False)
+
+ return model
+
+def train(cfgs):
+
+ dataloader = get_dataloader(cfgs)
+ model = get_model(cfgs)
+ checkpoint_callback = ModelCheckpoint(every_n_epochs=cfgs.check_freq, save_top_k=-1)
+ trainer = pl.Trainer(callbacks = [checkpoint_callback], **cfgs.lightning)
+ trainer.fit(model=model, train_dataloaders=dataloader)
+
+
+if __name__ == "__main__":
+
+ config_path = 'configs/GlyphTrain.yaml'
+ cfgs = OmegaConf.load(config_path)
+ train(cfgs)
\ No newline at end of file
diff --git a/src/model/TextGen/preglyph/utils.py b/src/model/TextGen/preglyph/utils.py
new file mode 100644
index 0000000..6d91953
--- /dev/null
+++ b/src/model/TextGen/preglyph/utils.py
@@ -0,0 +1,83 @@
+import os
+import json
+import torch
+import argparse
+import importlib
+import torch.nn as nn
+import pytorch_lightning as pl
+from pytorch_lightning.loggers import WandbLogger, TensorBoardLogger
+from pytorch_lightning import Trainer, Callback
+from pytorch_lightning.utilities import rank_zero_only
+
+def get_state_dict(d):
+ return d.get('state_dict', d)
+
+def load_state_dict(ckpt_path, location='cpu'):
+ _, extension = os.path.splitext(ckpt_path)
+ if extension.lower() == ".safetensors":
+ import safetensors.torch
+ state_dict = safetensors.torch.load_file(ckpt_path, device=location)
+ else:
+ state_dict = get_state_dict(torch.load(ckpt_path, map_location=torch.device(location)))
+ state_dict = get_state_dict(state_dict)
+ print(f'Loaded state_dict from [{ckpt_path}]')
+ return state_dict
+
+def count_params(model):
+ total_params = sum(p.numel() for p in model.parameters())
+ print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")
+
+
+def get_obj_from_str(string, reload=False):
+ module, cls = string.rsplit(".", 1)
+ if reload:
+ module_imp = importlib.import_module(module)
+ importlib.reload(module_imp)
+ return getattr(importlib.import_module(module, package=None), cls)
+
+
+def instantiate_from_config(config):
+    if "target" not in config:
+ raise KeyError("Expected key `target` to instantiate.")
+ return get_obj_from_str(config["target"])(**config.get("params", dict()))
+
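+# Usage sketch for instantiate_from_config (hypothetical params):
+#   instantiate_from_config({"target": "modules.LabelEncoder",
+#                            "params": {"max_len": 24, "emb_dim": 768}})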
+
+def nondefault_trainer_args(opt):
+ parser = argparse.ArgumentParser()
+ parser = Trainer.add_argparse_args(parser)
+ args = parser.parse_args([])
+ return sorted(k for k in vars(args) if getattr(opt, k) != getattr(args, k))
+
+
+def module_requires_grad(module: nn.Module):
+ for param in module.parameters():
+ if not param.requires_grad:
+ return False
+ return True
+
+@rank_zero_only
+def pl_on_train_tart(pl_module: pl.LightningModule):
+    # isinstance checks must run on the Lightning logger itself;
+    # .experiment is the underlying run object (e.g. a wandb Run)
+    logger = pl_module.logger
+    if isinstance(logger, WandbLogger):
+        print("Logging code")
+        logger.experiment.log_code(
+            os.getcwd(),
+            include_fn=lambda path: path.endswith(".py") or path.endswith(".ipynb") or path.endswith(".yaml")
+        )
+    elif isinstance(logger, TensorBoardLogger):
+        print("Logging git info")
+        logger.log_hyperparams({"git_version": os.popen("git log -1").read().split("\n")[0]})
+
+ print("***** Start training *****")
+ num_samples = len(pl_module.trainer.train_dataloader.dataset)
+ max_epoch = pl_module.trainer.max_epochs
+ total_step = pl_module.trainer.estimated_stepping_batches
+ total_batch_size = round(num_samples * max_epoch / total_step)
+ print(f" Num examples = {num_samples}")
+ print(f" Num Epochs = {max_epoch}")
+ print(f" Total GPU device number: {pl_module.trainer.num_devices}")
+ print(f" Gradient Accumulation steps = {pl_module.trainer.accumulate_grad_batches}")
+ print(f" Instant batch size: {round(total_batch_size * pl_module.trainer.num_devices)}")
+ print(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+ print(f" Total optimization steps = {total_step}")
+
\ No newline at end of file
diff --git a/src/model/TextGen/prestyle/.idea/.gitignore b/src/model/TextGen/prestyle/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/src/model/TextGen/prestyle/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/src/model/TextGen/prestyle/.idea/Disentangle.iml b/src/model/TextGen/prestyle/.idea/Disentangle.iml
new file mode 100644
index 0000000..8b8c395
--- /dev/null
+++ b/src/model/TextGen/prestyle/.idea/Disentangle.iml
@@ -0,0 +1,12 @@
diff --git a/src/model/TextGen/prestyle/.idea/deployment.xml b/src/model/TextGen/prestyle/.idea/deployment.xml
new file mode 100644
index 0000000..fa2baaa
--- /dev/null
+++ b/src/model/TextGen/prestyle/.idea/deployment.xml
@@ -0,0 +1,56 @@
diff --git a/src/model/TextGen/prestyle/.idea/inspectionProfiles/Project_Default.xml b/src/model/TextGen/prestyle/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..08ed659
--- /dev/null
+++ b/src/model/TextGen/prestyle/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,102 @@
diff --git a/src/model/TextGen/prestyle/.idea/inspectionProfiles/profiles_settings.xml b/src/model/TextGen/prestyle/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/src/model/TextGen/prestyle/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
diff --git a/src/model/TextGen/prestyle/.idea/misc.xml b/src/model/TextGen/prestyle/.idea/misc.xml
new file mode 100644
index 0000000..a971a2c
--- /dev/null
+++ b/src/model/TextGen/prestyle/.idea/misc.xml
@@ -0,0 +1,4 @@
diff --git a/src/model/TextGen/prestyle/.idea/modules.xml b/src/model/TextGen/prestyle/.idea/modules.xml
new file mode 100644
index 0000000..4caa121
--- /dev/null
+++ b/src/model/TextGen/prestyle/.idea/modules.xml
@@ -0,0 +1,8 @@
diff --git a/src/model/TextGen/prestyle/configs/StyleTrain.yaml b/src/model/TextGen/prestyle/configs/StyleTrain.yaml
new file mode 100644
index 0000000..e052edc
--- /dev/null
+++ b/src/model/TextGen/prestyle/configs/StyleTrain.yaml
@@ -0,0 +1,60 @@
+data:
+ target: "dataset.WrappedDataModule"
+ batch_size: 256
+ train:
+ size: 128
+ root_dir: ".../Syn_data/train/"
+ font_dir: ".../Syn_data/fonts/"
+ template_font: ".../Syn_data/fonts/arial.ttf"
+
+ validation:
+ size: 128
+ root_dir: ".../Syn_data/eval/"
+ font_dir: ".../Syn_data/fonts/"
+ template_font: ".../Syn_data/fonts/arial.ttf"
+
+num_workers: 16
+check_freq: 10
+
+
+
+model:
+ target: trainer.StyleNet
+ params:
+ image_size: 128
+ patch_size: 16
+ in_chans: 3
+ num_classes: 0
+ emb_dim: 768
+ ckpt_path:
+ trainable: True
+ lr: 1e-5
+ alpha_removal: 1.0
+ alpha_seg: 1.0
+ alpha_color: 1.0
+ alpha_font: 1.0
+ enable_spatial_head: True
+ enable_glyph_head: True
+
+
+
+
+
+lightning:
+ log_every_n_steps: 50
+ accumulate_grad_batches: 1
+ max_epochs: 80
+ accelerator: "cuda"
+ devices: [0, 1, 2, 3]
+ strategy: ddp
+ default_root_dir: "./logs/pre_style_logs"
+
+
+imagelogger_callback:
+ target: "trainer.StyleNetImageLogger"
+ params:
+ train_batch_frequency: 4500
+ valid_batch_frequency: 2
+ disable_wandb: false
+ generation_kwargs:
+ seed: 42
diff --git a/src/model/TextGen/prestyle/dataset.py b/src/model/TextGen/prestyle/dataset.py
new file mode 100644
index 0000000..9d728e3
--- /dev/null
+++ b/src/model/TextGen/prestyle/dataset.py
@@ -0,0 +1,188 @@
+import os
+import pytorch_lightning as pl
+from PIL import Image, ImageFont, ImageDraw
+from torch.utils.data import Dataset
+import torchvision.transforms as transforms
+import cv2
+import numpy as np
+from torch.utils.data import DataLoader
+
+
+class WrappedDataModule(pl.LightningDataModule):
+ def __init__(self, data_config, **kwargs):
+ super().__init__()
+ self.save_hyperparameters()
+ self.config = data_config
+ self.batch_size = data_config.batch_size
+
+ def setup(self, stage: str):
+ if stage == "fit":
+ self.train = StyleDataset(self.config.train)
+ self.val = StyleDataset(self.config.validation)
+ if stage == "test" or stage == "predict":
+ self.val = StyleDataset(self.config.test)
+
+ def train_dataloader(self):
+ return DataLoader(
+ self.train,
+ batch_size=self.batch_size,
+ shuffle=True,
+ num_workers=8,
+ )
+
+ def val_dataloader(self):
+ return DataLoader(
+ self.val,
+ batch_size=self.batch_size,
+ shuffle=False,
+ num_workers=8,
+ )
+
+
+
+
+class StyleDataset(Dataset):
+ def __init__(self, config):
+ super().__init__()
+ self.size = config.size
+ self.data_dir_list = []
+ self.template_font = config.template_font
+ for dir_name in os.listdir(config.root_dir):
+ self.data_dir_list.append(os.path.join(config.root_dir, dir_name))
+ self.font_paths = [os.path.join(config.font_dir, font_name) for font_name in os.listdir(config.font_dir)]
+
+ # source image
+ self.i_s = []
+ # background image
+ self.t_b = []
+ # segmentation image
+ self.mask_s = []
+ # colorize mask and image
+ self.mask_t = []
+ self.t_t = []
+
+
+ # source text and target text and font
+ self.text_s = []
+ self.text_t = []
+ self.class_font = []
+
+ for data_dir in self.data_dir_list:
+ tmp_names = []
+ with open(os.path.join(data_dir, 'i_s.txt'), 'r') as f_s:
+ lines = f_s.readlines()
+ for line in lines:
+                    name, text = line.split(' ', 1)
+ self.text_s.append(text)
+ tmp_names.append(name)
+ with open(os.path.join(data_dir, 'i_t.txt'), 'r') as f_t:
+ lines = f_t.readlines()
+ for line in lines:
+                    _, text = line.split(' ', 1)
+ self.text_t.append(text)
+ with open(os.path.join(data_dir, 'font.txt'), 'r') as f_f:
+ lines = f_f.readlines()
+ for line in lines:
+                    _, text = line.split(' ', 1)
+                    self.class_font.append(os.path.join(config.font_dir, text.rsplit('/', maxsplit=1)[-1]))
+ for tmp_name in tmp_names:
+ self.i_s.append(os.path.join(data_dir, 'i_s', tmp_name))
+ self.t_b.append(os.path.join(data_dir, 't_b', tmp_name))
+ self.mask_s.append(os.path.join(data_dir, 'mask_s', tmp_name))
+ self.mask_t.append(os.path.join(data_dir, 'mask_t', tmp_name))
+ self.t_t.append(os.path.join(data_dir, 't_t', tmp_name))
+
+
+
+ self.resize = transforms.Resize((self.size, self.size), transforms.InterpolationMode.BICUBIC, antialias=True)
+ self.transfer = transforms.ToTensor()
+        print(f"Collected {len(self.i_s)} samples in total")
+
+
+ def __len__(self):
+ return len(self.i_s)
+
+ def detect_edges(self, input_array):
+ edges = cv2.Canny(input_array, 100, 200)
+ return edges
+
+ def __getitem__(self, index):
+ img_i_s = Image.open(self.i_s[index]).convert("RGB")
+ img_i_s = self.resize(img_i_s)
+ i_s = self.transfer(img_i_s)
+
+ # Removal
+ img_bg = Image.open(self.t_b[index]).convert("RGB")
+ img_bg = self.resize(img_bg)
+ bg = self.transfer(img_bg)
+
+ # Segmentation
+ img_mask_s = Image.open(self.mask_s[index])
+ img_seg = self.resize(img_mask_s)
+ seg = self.transfer(img_seg)
+
+ # Colorized
+ img_color_template = Image.open(self.mask_t[index]).convert("RGB")
+ img_color_template = self.resize(img_color_template)
+ color_template = self.transfer(img_color_template)
+
+ img_color_result = Image.open(self.t_t[index]).convert("RGB")
+ img_color_result = self.resize(img_color_result)
+ array_color_result = np.array(img_color_result)
+
+ color_mask = Image.open(self.mask_t[index]).convert("L")
+ color_mask = self.resize(color_mask)
+ color_mask = self.transfer(color_mask)
+ color_mask = np.array(color_mask)
+
+        # zero out pixels outside the text mask (vectorized over H x W)
+        array_color_result = (array_color_result * color_mask[0][..., None]).astype(np.uint8)
+ color_result = self.transfer(array_color_result)
+
+ # Fontilized
+ text_s = self.text_s[index].strip()
+ text_t = self.text_t[index].strip()
+
+ template_font = ImageFont.truetype(self.template_font, 64)
+ std_l, std_t, std_r, std_b = template_font.getbbox(text_t)
+ std_h, std_w = std_b - std_t, std_r - std_l
+ img_font_template = Image.new('RGB', (std_w + 10, std_h + 10), color=(0, 0, 0))
+ draw = ImageDraw.Draw(img_font_template)
+ draw.text((5, 5), text_t, fill=(255, 255, 255), font=template_font, anchor="lt")
+ img_font_template = self.resize(img_font_template)
+ array_font_template = np.array(img_font_template)
+ edge_font_template = self.detect_edges(array_font_template)
+ font_template = self.transfer(edge_font_template)
+ font_template = font_template.repeat(3, 1, 1)
+
+ style_font = self.class_font[index].strip()
+ result_font = ImageFont.truetype(style_font, 64)
+ std_l, std_t, std_r, std_b = result_font.getbbox(text_t)
+ std_h, std_w = std_b - std_t, std_r - std_l
+ img_font_result = Image.new('RGB', (std_w + 10, std_h + 10), color=(0, 0, 0))
+ draw = ImageDraw.Draw(img_font_result)
+ draw.text((5, 5), text_t, fill=(255, 255, 255), font=result_font, anchor="lt")
+ img_font_result = self.resize(img_font_result)
+ array_font_result = np.array(img_font_result)
+ edge_font_result = self.detect_edges(array_font_result)
+ font_result = self.transfer(edge_font_result)
+ font_result = font_result.repeat(3, 1, 1)
+
+ batch = {
+ "text_s": text_s,
+ "text_t": text_t,
+ "i_s": i_s,
+ "bg": bg,
+ "seg": seg,
+ "c_t": color_template,
+ "c_r": color_result,
+ "f_t": font_template,
+ "f_r": font_result,
+ }
+ return batch
+
+
+
+
+
diff --git a/src/model/TextGen/prestyle/extractors.py b/src/model/TextGen/prestyle/extractors.py
new file mode 100644
index 0000000..4863ea1
--- /dev/null
+++ b/src/model/TextGen/prestyle/extractors.py
@@ -0,0 +1,366 @@
+from collections import OrderedDict
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils import model_zoo
+from torchvision.models.densenet import densenet121, densenet161
+from torchvision.models.squeezenet import squeezenet1_1
+
+
+def load_weights_sequential(target, source_state):
+ new_dict = OrderedDict()
+ for (k1, v1), (k2, v2) in zip(target.state_dict().items(), source_state.items()):
+ new_dict[k1] = v2
+ target.load_state_dict(new_dict)
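+# NOTE: parameters are copied by iteration order of the two state dicts rather
+# than by name, so source and target must have structurally identical layouts.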
+
+'''
+ Implementation of dilated ResNet-101 with deep supervision. Downsampling is changed to 8x
+'''
+model_urls = {
+ 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
+ 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
+ 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
+ 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
+ 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
+}
+
+
+def conv3x3(in_planes, out_planes, stride=1, dilation=1):
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+ padding=dilation, dilation=dilation, bias=False)
+
+
+class BasicBlock(nn.Module):
+ expansion = 1
+
+ def __init__(self, inplanes, planes, stride=1, downsample=None, dilation=1):
+ super(BasicBlock, self).__init__()
+ self.conv1 = conv3x3(inplanes, planes, stride=stride, dilation=dilation)
+ self.bn1 = nn.BatchNorm2d(planes)
+ self.relu = nn.ReLU(inplace=True)
+ self.conv2 = conv3x3(planes, planes, stride=1, dilation=dilation)
+ self.bn2 = nn.BatchNorm2d(planes)
+ self.downsample = downsample
+ self.stride = stride
+
+ def forward(self, x):
+ residual = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out += residual
+ out = self.relu(out)
+
+ return out
+
+
+class Bottleneck(nn.Module):
+ expansion = 4
+
+ def __init__(self, inplanes, planes, stride=1, downsample=None, dilation=1):
+ super(Bottleneck, self).__init__()
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+ self.bn1 = nn.BatchNorm2d(planes)
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, dilation=dilation,
+ padding=dilation, bias=False)
+ self.bn2 = nn.BatchNorm2d(planes)
+ self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
+ self.bn3 = nn.BatchNorm2d(planes * 4)
+ self.relu = nn.ReLU(inplace=True)
+ self.downsample = downsample
+ self.stride = stride
+
+ def forward(self, x):
+ residual = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+ out = self.relu(out)
+
+ out = self.conv3(out)
+ out = self.bn3(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out += residual
+ out = self.relu(out)
+
+ return out
+
+
+class ResNet(nn.Module):
+ def __init__(self, block, layers=(3, 4, 23, 3)):
+ self.inplanes = 64
+ super(ResNet, self).__init__()
+ self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
+ bias=False)
+ self.bn1 = nn.BatchNorm2d(64)
+ self.relu = nn.ReLU(inplace=True)
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+ self.layer1 = self._make_layer(block, 64, layers[0])
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2)
+ #self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4)
+ self.layer4 = self._make_layer(block, 768, layers[3], stride=1, dilation=4)
+
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ m.weight.data.normal_(0, math.sqrt(2. / n))
+ elif isinstance(m, nn.BatchNorm2d):
+ m.weight.data.fill_(1)
+ m.bias.data.zero_()
+
+ def _make_layer(self, block, planes, blocks, stride=1, dilation=1):
+ downsample = None
+ if stride != 1 or self.inplanes != planes * block.expansion:
+ downsample = nn.Sequential(
+ nn.Conv2d(self.inplanes, planes * block.expansion,
+ kernel_size=1, stride=stride, bias=False),
+ nn.BatchNorm2d(planes * block.expansion),
+ )
+
+ layers = [block(self.inplanes, planes, stride, downsample)]
+ self.inplanes = planes * block.expansion
+ for i in range(1, blocks):
+ layers.append(block(self.inplanes, planes, dilation=dilation))
+
+ return nn.Sequential(*layers)
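+    # NOTE: layer3/layer4 replace striding with dilation (2 and 4), so spatial
+    # resolution stays at 1/8 of the input while the receptive field keeps growing.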
+
+ def forward(self, x):
+ x = self.conv1(x)
+ x = self.bn1(x)
+ x = self.relu(x)
+ x = self.maxpool(x)
+
+ x = self.layer1(x)
+ x = self.layer2(x)
+ x_3 = self.layer3(x)
+ x = self.layer4(x_3)
+
+ return x, x_3
+
+
+'''
+ Implementation of DenseNet with deep supervision. Downsampling is changed to 8x
+'''
+
+
+class _DenseLayer(nn.Sequential):
+ def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
+ super(_DenseLayer, self).__init__()
+        # Dots are not allowed in module names; the dot-free names also match
+        # torchvision's densenet121 keys, which the pretrained load below relies on.
+        self.add_module('norm1', nn.BatchNorm2d(num_input_features))
+        self.add_module('relu1', nn.ReLU(inplace=True))
+        self.add_module('conv1', nn.Conv2d(num_input_features, bn_size *
+                                           growth_rate, kernel_size=1, stride=1, bias=False))
+        self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate))
+        self.add_module('relu2', nn.ReLU(inplace=True))
+        self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate,
+                                           kernel_size=3, stride=1, padding=1, bias=False))
+ self.drop_rate = drop_rate
+
+ def forward(self, x):
+ new_features = super(_DenseLayer, self).forward(x)
+ if self.drop_rate > 0:
+ new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
+ return torch.cat([x, new_features], 1)
+
+
+class _DenseBlock(nn.Sequential):
+ def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate):
+ super(_DenseBlock, self).__init__()
+ for i in range(num_layers):
+ layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate)
+ self.add_module('denselayer%d' % (i + 1), layer)
+
+
+class _Transition(nn.Sequential):
+ def __init__(self, num_input_features, num_output_features, downsample=True):
+ super(_Transition, self).__init__()
+ self.add_module('norm', nn.BatchNorm2d(num_input_features))
+ self.add_module('relu', nn.ReLU(inplace=True))
+ self.add_module('conv', nn.Conv2d(num_input_features, num_output_features,
+ kernel_size=1, stride=1, bias=False))
+ if downsample:
+ self.add_module('pool', nn.AvgPool2d(kernel_size=2, stride=2))
+ else:
+ self.add_module('pool', nn.AvgPool2d(kernel_size=1, stride=1)) # compatibility hack
+
+
+class DenseNet(nn.Module):
+ def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
+ num_init_features=64, bn_size=4, drop_rate=0, pretrained=True):
+
+ super(DenseNet, self).__init__()
+
+ # First convolution
+ self.start_features = nn.Sequential(OrderedDict([
+ ('conv0', nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
+ ('norm0', nn.BatchNorm2d(num_init_features)),
+ ('relu0', nn.ReLU(inplace=True)),
+ ('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
+ ]))
+
+ # Each denseblock
+ num_features = num_init_features
+
+        init_weights = list(densenet121(pretrained=True).features.children()) if pretrained else None  # only download weights when needed
+ start = 0
+ for i, c in enumerate(self.start_features.children()):
+ if pretrained:
+ c.load_state_dict(init_weights[i].state_dict())
+ start += 1
+ self.blocks = nn.ModuleList()
+ for i, num_layers in enumerate(block_config):
+ block = _DenseBlock(num_layers=num_layers, num_input_features=num_features,
+ bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
+ if pretrained:
+ block.load_state_dict(init_weights[start].state_dict())
+ start += 1
+ self.blocks.append(block)
+ setattr(self, 'denseblock%d' % (i + 1), block)
+
+ num_features = num_features + num_layers * growth_rate
+ if i != len(block_config) - 1:
+ downsample = i < 1
+ trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2,
+ downsample=downsample)
+ if pretrained:
+ trans.load_state_dict(init_weights[start].state_dict())
+ start += 1
+ self.blocks.append(trans)
+ setattr(self, 'transition%d' % (i + 1), trans)
+ num_features = num_features // 2
+
+ def forward(self, x):
+ out = self.start_features(x)
+ deep_features = None
+ for i, block in enumerate(self.blocks):
+ out = block(out)
+ if i == 5:
+ deep_features = out
+
+ return out, deep_features
+
+
+class Fire(nn.Module):
+
+ def __init__(self, inplanes, squeeze_planes,
+ expand1x1_planes, expand3x3_planes, dilation=1):
+ super(Fire, self).__init__()
+ self.inplanes = inplanes
+ self.squeeze = nn.Conv2d(inplanes, squeeze_planes, kernel_size=1)
+ self.squeeze_activation = nn.ReLU(inplace=True)
+ self.expand1x1 = nn.Conv2d(squeeze_planes, expand1x1_planes,
+ kernel_size=1)
+ self.expand1x1_activation = nn.ReLU(inplace=True)
+ self.expand3x3 = nn.Conv2d(squeeze_planes, expand3x3_planes,
+ kernel_size=3, padding=dilation, dilation=dilation)
+ self.expand3x3_activation = nn.ReLU(inplace=True)
+
+ def forward(self, x):
+ x = self.squeeze_activation(self.squeeze(x))
+ return torch.cat([
+ self.expand1x1_activation(self.expand1x1(x)),
+ self.expand3x3_activation(self.expand3x3(x))
+ ], 1)
+
+
+class SqueezeNet(nn.Module):
+
+ def __init__(self, pretrained=False):
+ super(SqueezeNet, self).__init__()
+
+ self.feat_1 = nn.Sequential(
+ nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1),
+ nn.ReLU(inplace=True)
+ )
+ self.feat_2 = nn.Sequential(
+ nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
+ Fire(64, 16, 64, 64),
+ Fire(128, 16, 64, 64)
+ )
+ self.feat_3 = nn.Sequential(
+ nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
+ Fire(128, 32, 128, 128, 2),
+ Fire(256, 32, 128, 128, 2)
+ )
+ self.feat_4 = nn.Sequential(
+ Fire(256, 48, 192, 192, 4),
+ Fire(384, 48, 192, 192, 4),
+ Fire(384, 64, 256, 256, 4),
+ Fire(512, 64, 256, 256, 4)
+ )
+ if pretrained:
+ weights = squeezenet1_1(pretrained=True).features.state_dict()
+ load_weights_sequential(self, weights)
+
+ def forward(self, x):
+ f1 = self.feat_1(x)
+ f2 = self.feat_2(f1)
+ f3 = self.feat_3(f2)
+ f4 = self.feat_4(f3)
+ return f4, f3
+
+
+'''
+ Handy methods for construction
+'''
+
+
+def squeezenet(pretrained=True):
+ return SqueezeNet(pretrained)
+
+
+def densenet(pretrained=True):
+ return DenseNet(pretrained=pretrained)
+
+
+def resnet18(pretrained=True):
+ model = ResNet(BasicBlock, [2, 2, 2, 2])
+ if pretrained:
+ load_weights_sequential(model, model_zoo.load_url(model_urls['resnet18']))
+ return model
+
+
+def resnet34(pretrained=True):
+ model = ResNet(BasicBlock, [3, 4, 6, 3])
+ if pretrained:
+ load_weights_sequential(model, model_zoo.load_url(model_urls['resnet34']))
+ return model
+
+
+def resnet50(pretrained=True):
+ model = ResNet(Bottleneck, [3, 4, 6, 3])
+ if pretrained:
+ load_weights_sequential(model, model_zoo.load_url(model_urls['resnet50']))
+ return model
+
+
+def resnet101(pretrained=True):
+ model = ResNet(Bottleneck, [3, 4, 23, 3])
+ if pretrained:
+ load_weights_sequential(model, model_zoo.load_url(model_urls['resnet101']))
+ return model
+
+
+def resnet152(pretrained=True):
+ model = ResNet(Bottleneck, [3, 8, 36, 3])
+ if pretrained:
+ load_weights_sequential(model, model_zoo.load_url(model_urls['resnet152']))
+ return model
diff --git a/src/model/TextGen/prestyle/model.py b/src/model/TextGen/prestyle/model.py
new file mode 100644
index 0000000..e5d1c6c
--- /dev/null
+++ b/src/model/TextGen/prestyle/model.py
@@ -0,0 +1,473 @@
+import torch
+import numpy as np
+import torch.nn as nn
+from functools import partial
+import torch.nn.functional as F
+from timm.models.layers import drop_path, to_2tuple, trunc_normal_
+from einops import rearrange
+
+import extractors  # local module: prestyle/extractors.py
+
+
+class PatchEmbed(nn.Module):
+ """
+ Image to Patch Embedding
+ """
+ def __init__(self, img_size=128, patch_size=16, in_chans=3, embed_dim=768):
+ super().__init__()
+ img_size = to_2tuple(img_size)
+ patch_size = to_2tuple(patch_size)
+ num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
+ self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
+ self.img_size = img_size
+ self.patch_size = patch_size
+ self.num_patches = num_patches
+
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+ def forward(self, x, **kwargs):
+ B, C, H, W = x.shape
+ # FIXME look at relaxing size constraints
+ assert H == self.img_size[0] and W == self.img_size[1], \
+ f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+ x = self.proj(x).flatten(2).transpose(1, 2)
+ return x
+
+
+class Attention(nn.Module):
+ def __init__(
+ self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
+ proj_drop=0., attn_head_dim=None):
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ if attn_head_dim is not None:
+ head_dim = attn_head_dim
+ all_head_dim = head_dim * self.num_heads
+ self.scale = qk_scale or head_dim ** -0.5
+
+ self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
+ if qkv_bias:
+ self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
+ self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
+ else:
+ self.q_bias = None
+ self.v_bias = None
+
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(all_head_dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ def forward(self, x):
+ B, N, C = x.shape
+ qkv_bias = None
+ if self.q_bias is not None:
+ qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
+ # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+ qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
+ qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
+
+ q = q * self.scale
+ attn = (q @ k.transpose(-2, -1))
+
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+class Mlp(nn.Module):
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.fc1 = nn.Linear(in_features, hidden_features)
+ self.act = act_layer()
+ self.fc2 = nn.Linear(hidden_features, out_features)
+ self.drop = nn.Dropout(drop)
+
+ def forward(self, x):
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
+
+
+class DropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+ """
+
+ def __init__(self, drop_prob=None):
+ super(DropPath, self).__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, x):
+ return drop_path(x, self.drop_prob, self.training)
+
+ def extra_repr(self) -> str:
+ return 'p={}'.format(self.drop_prob)
+
+
+class Block(nn.Module):
+
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+ drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
+ attn_head_dim=None):
+ super().__init__()
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention(
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim)
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+        if init_values is not None and init_values > 0:
+ self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
+ self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
+ else:
+ self.gamma_1, self.gamma_2 = None, None
+
+ def forward(self, x):
+ if self.gamma_1 is None:
+ x = x + self.drop_path(self.attn(self.norm1(x)))
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ else:
+ x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x)))
+ x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
+ return x
+
+
+def get_sinusoid_encoding_table(n_position, d_hid):
+ ''' Sinusoid position encoding table '''
+ # TODO: make it with torch instead of numpy
+ def get_position_angle_vec(position):
+ return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
+
+ sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
+
+ return torch.FloatTensor(sinusoid_table).unsqueeze(0)
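+# A torch-only equivalent of the table above (re the TODO); an untested sketch
+# kept as a comment so the numpy behaviour stays untouched:
+#
+#   def get_sinusoid_encoding_table_torch(n_position, d_hid):
+#       position = torch.arange(n_position, dtype=torch.float32).unsqueeze(1)
+#       div_term = torch.pow(10000.0, (torch.arange(d_hid) // 2 * 2).float() / d_hid)
+#       angles = position / div_term
+#       is_even = (torch.arange(d_hid) % 2) == 0
+#       return torch.where(is_even, angles.sin(), angles.cos()).unsqueeze(0)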
+
+
+class VisionTransformerEncoder(nn.Module):
+ def __init__(self, img_size=128, patch_size=16, in_chans=3, num_classes=0, embed_dim=768, depth=12,
+ num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
+ drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None,
+ use_learnable_pos_emb=False):
+ super().__init__()
+ self.num_classes = num_classes
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
+
+ self.patch_embed = PatchEmbed(
+ img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+ num_patches = self.patch_embed.num_patches
+
+ # Pos_emb
+ if use_learnable_pos_emb:
+            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))  # no cls token is prepended below
+ else:
+ # sine-cosine positional embeddings
+ self.pos_embed = get_sinusoid_encoding_table(num_patches, embed_dim)
+
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
+ self.blocks = nn.ModuleList([
+ Block(
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
+ init_values=init_values)
+ for i in range(depth)])
+ self.norm = norm_layer(embed_dim)
+ self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ if use_learnable_pos_emb:
+ trunc_normal_(self.pos_embed, std=.02)
+
+ # trunc_normal_(self.cls_token, std=.02)
+ self.apply(self._init_weights)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ nn.init.xavier_uniform_(m.weight)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+
+ def get_num_layers(self):
+ return len(self.blocks)
+
+ @torch.jit.ignore
+ def no_weight_decay(self):
+ return {'pos_embed', 'cls_token'}
+
+ def get_classifier(self):
+ return self.head
+
+ def reset_classifier(self, num_classes, global_pool=''):
+ self.num_classes = num_classes
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ def forward_features(self, x, mask):
+ x = self.patch_embed(x)
+
+ # cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+ # x = torch.cat((cls_tokens, x), dim=1)
+ x = x + self.pos_embed.type_as(x).to(x.device).clone().detach()
+
+ B, _, C = x.shape
+        if mask is None:
+ x_vis = x.reshape(B, -1, C)
+ else:
+ x_vis = x[~mask].reshape(B, -1, C) # ~mask means visible
+
+ for blk in self.blocks:
+ x_vis = blk(x_vis)
+
+ x_vis = self.norm(x_vis)
+ return x_vis
+
+ def forward(self, x, mask):
+ x = self.forward_features(x, mask)
+ x = self.head(x)
+ return x
+
+
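+# StyleEncoder: a shared ViT backbone whose output is split by two single-block
+# attention branches into a spatial stream (for removal/segmentation) and a
+# glyph stream (for color/font), consumed by SpatialHead and GlyphHead below.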
+class StyleEncoder(nn.Module):
+ def __init__(self, image_size=128, patch_size=16, in_chans=3, embed_dim=768):
+ super().__init__()
+ self.image_size = image_size
+ self.patch_size = patch_size
+ # Backbone.
+ self.vit = VisionTransformerEncoder(img_size=image_size, patch_size=patch_size,in_chans=in_chans,
+ num_classes=0,
+ embed_dim=embed_dim, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True,
+ qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.,
+ norm_layer=partial(nn.LayerNorm, eps=1e-6), init_values=0,
+ use_learnable_pos_emb=False,)
+
+ # Block for spatial and glyph.
+ self.spatial_attn_block = nn.ModuleList([
+ Block(
+ dim=embed_dim, num_heads=12, mlp_ratio=4, qkv_bias=True, qk_scale=None,
+ drop=0., attn_drop=0., drop_path=0., norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ init_values=0)
+ for _ in range(1)])
+
+ self.glyph_attn_block = nn.ModuleList([
+ Block(
+ dim=embed_dim, num_heads=12, mlp_ratio=4, qkv_bias=True, qk_scale=None,
+ drop=0., attn_drop=0., drop_path=0., norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ init_values=0)
+ for _ in range(1)])
+
+
+ def forward(self, x):
+ # Backbone
+ x_feature = self.vit(x, mask=None)
+
+ # Spatial
+ x_spatial = x_feature
+ for block in self.spatial_attn_block:
+ x_spatial = block(x_spatial)
+
+ # Glyph
+ x_glyph = x_feature
+ for block in self.glyph_attn_block:
+ x_glyph = block(x_glyph)
+
+ return x_spatial, x_glyph
+
+
+
+class SpatialHead(nn.Module):
+
+ def __init__(self, image_size=128, patch_size=16, in_chans=3, embed_dim=768):
+ super().__init__()
+ self.image_size = image_size
+ self.patch_size = patch_size
+ # Block for spatial
+ self.spatial_res_block = Residual_with_Attention(image_size=image_size, patch_size=patch_size,
+ in_channels=embed_dim, out_channels=embed_dim)
+
+ # Head for removal, segmentation.
+ self.removal_head = nn.Sequential(
+ nn.Conv2d(in_channels=embed_dim, out_channels=embed_dim, kernel_size=3, stride=1, padding=1),
+ nn.InstanceNorm2d(embed_dim),
+ nn.ELU(alpha=1.0),
+ nn.Conv2d(in_channels=embed_dim, out_channels=3 * self.patch_size * self.patch_size, kernel_size=1,
+ stride=1, padding=0),
+ )
+
+ self.segmentation_head = nn.Sequential(
+ nn.Conv2d(in_channels=embed_dim, out_channels=embed_dim, kernel_size=3, stride=1, padding=1),
+ nn.InstanceNorm2d(embed_dim),
+ nn.ReLU(),
+ nn.Conv2d(in_channels=embed_dim, out_channels=1 * self.patch_size * self.patch_size, kernel_size=1,
+ stride=1, padding=0),
+ nn.Sigmoid()
+ )
+
+ def forward(self, x_spatial):
+ x_out, x_seg = self.spatial_res_block(x_spatial)
+ x_spatial = x_out + x_spatial
+
+ x_removal = rearrange(x_spatial, 'b (h w) c -> b c h w', h=self.image_size // self.patch_size)
+ x_seg = self.segmentation_head(x_seg)
+ x_removal = self.removal_head(x_removal)
+
+ out_removal = rearrange(x_removal, 'b (p1 p2 d) h w -> b d (h p1) (w p2)', p1=self.patch_size, p2=self.patch_size)
+ out_seg = rearrange(x_seg, 'b (p1 p2) h w -> b 1 (h p1) (w p2)', p1=self.patch_size, p2=self.patch_size)
+
+ return out_removal, out_seg
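+    # The rearranges above fold each token's channel vector back into its
+    # patch_size x patch_size pixel block, turning the token grid into a
+    # full-resolution 3-channel removal map and a 1-channel segmentation map.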
+
+
+
+class GlyphHead(nn.Module):
+ def __init__(self, image_size=128, patch_size=16, in_chans=3, embed_dim=768, res_dim=768,
+ color_backend='resnet34', font_backend='resnet34', pretrained=False):
+ super().__init__()
+ self.image_size = image_size
+ self.patch_size = patch_size
+
+ # Color
+ self.color_encoder = getattr(extractors, color_backend)(pretrained)
+ self.align_to_color_feature = nn.Sequential(
+ nn.Conv2d(embed_dim, res_dim, kernel_size=3, stride=1, padding=1, dilation=1),
+ nn.InstanceNorm2d(res_dim),
+ nn.ELU(alpha=1.0)
+ )
+ self.color_head = nn.Sequential(
+ Upsample(res_dim, 256),
+ Upsample(256, 64),
+ Upsample(64, 3),
+ )
+
+
+ # Font
+ self.font_encoder = getattr(extractors, font_backend)(pretrained)
+ self.align_to_font_feature = nn.Sequential(
+ Upsample(embed_dim, res_dim),
+ nn.Conv2d(res_dim, res_dim, kernel_size=3, stride=1, padding=1, dilation=1),
+ nn.InstanceNorm2d(res_dim),
+ nn.ELU(alpha=1.0),
+
+ )
+ self.font_psp = PSPModule(res_dim * 2, res_dim, sizes=(1, 2, 3, 6))
+ self.font_head = nn.Sequential(
+ Upsample(res_dim, 256),
+ Upsample(256, 64),
+ Upsample(64, 3),
+ nn.Sigmoid()
+ )
+
+
+
+ def forward(self, x_glyph, c_t, f_t, alpha=1.0):
+ x_glyph = rearrange(x_glyph, 'b (h w) c -> b c h w', h=self.image_size // self.patch_size)
+
+
+ ct_feature, _ = self.color_encoder(c_t)
+ x_c_feature = self.align_to_color_feature(x_glyph)
+ out_c = adain(ct_feature, x_c_feature)
+ out_c = alpha * out_c + (1-alpha) * ct_feature
+ out_c = self.color_head(out_c)
+
+ ft_feature, _ = self.font_encoder(f_t)
+ x_f_feature = self.align_to_font_feature(x_glyph)
+ out_f = self.font_psp(torch.cat((ft_feature, x_f_feature), dim=1))
+ out_f = self.font_head(out_f)
+
+ return out_c, out_f
+
+class Residual_with_Attention(nn.Module):
+ def __init__(self, image_size=128, patch_size=16, in_channels=768, out_channels=768, stride=1, padding=1, dilation=1):
+ super().__init__()
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.conv1 = nn.Conv2d(
+ in_channels, out_channels, kernel_size=3, stride=stride, padding=padding, dilation=dilation
+ )
+ self.conv2 = nn.Conv2d(
+ out_channels, out_channels, kernel_size=3, stride=1, padding=padding, dilation=dilation
+ )
+ self.conv_pool = nn.Conv2d(2, 1, kernel_size=7, stride=1, padding=3, dilation=dilation, bias=False)
+ self.sig = nn.Sigmoid()
+
+ def forward(self, x): # x --> [B N C]
+ x_in = rearrange(x, 'b (h w) c -> b c h w', h=self.image_size//self.patch_size)
+ x_in = self.conv2(self.conv1(x_in))
+ avg_pool = torch.mean(x_in, dim=1, keepdim=True)
+ max_pool, _ = torch.max(x_in, dim=1, keepdim=True)
+ pool_mask = self.sig(self.conv_pool(torch.cat((avg_pool, max_pool), dim=1)))
+ x_seg = x_in * pool_mask
+ x_out = rearrange(x_seg, 'b c h w -> b (h w) c')
+
+ return x_out, x_seg
+
+
+class PSPModule(nn.Module):
+ def __init__(self, features, out_features=768, sizes=(1, 2, 3, 6)):
+ super().__init__()
+ self.stages = []
+ self.stages = nn.ModuleList([self._make_stage(features, size) for size in sizes])
+ self.bottleneck = nn.Conv2d(features * (len(sizes) + 1), out_features, kernel_size=1)
+ self.relu = nn.ELU(alpha=1.0)
+
+ def _make_stage(self, features, size):
+ prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
+ conv = nn.Conv2d(features, features, kernel_size=1, bias=False)
+ return nn.Sequential(prior, conv)
+
+ def forward(self, feats):
+ h, w = feats.size(2), feats.size(3)
+        priors = [F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=False) for stage in self.stages] + [feats]
+ bottle = self.bottleneck(torch.cat(priors, 1))
+ return self.relu(bottle)
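+    # Pyramid pooling: the input is pooled to 1x1 / 2x2 / 3x3 / 6x6 grids,
+    # projected, upsampled back to (h, w), concatenated with the original
+    # features, and fused by the 1x1 bottleneck.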
+
+
+class Upsample(nn.Module):
+ def __init__(self, in_channels, out_channels):
+ super().__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(in_channels, out_channels, 3, padding=1),
+ nn.BatchNorm2d(out_channels),
+ nn.PReLU()
+ )
+
+ def forward(self, x):
+ h, w = 2 * x.size(2), 2 * x.size(3)
+        p = F.interpolate(input=x, size=(h, w), mode='bilinear', align_corners=False)
+ return self.conv(p)
+
+def calc_mean_std(features):
+    """
+    :param features: shape of features -> [batch_size, c, h, w]
+    :return: features_mean, features_std: shape of mean/std -> [batch_size, c, 1, 1]
+    """
+
+ batch_size, c = features.size()[:2]
+ features_mean = features.reshape(batch_size, c, -1).mean(dim=2).reshape(batch_size, c, 1, 1)
+ features_std = features.reshape(batch_size, c, -1).std(dim=2).reshape(batch_size, c, 1, 1) + 1e-6
+ return features_mean, features_std
+
+def adain(content_features, style_features):
+ """
+ Adaptive Instance Normalization
+
+ :param content_features: shape -> [batch_size, c, h, w]
+ :param style_features: shape -> [batch_size, c, h, w]
+ :return: normalized_features shape -> [batch_size, c, h, w]
+ """
+ content_mean, content_std = calc_mean_std(content_features)
+ style_mean, style_std = calc_mean_std(style_features)
+ normalized_features = style_std * (content_features - content_mean) / content_std + style_mean
+ return normalized_features
diff --git a/src/model/TextGen/prestyle/train.py b/src/model/TextGen/prestyle/train.py
new file mode 100644
index 0000000..c3e84ef
--- /dev/null
+++ b/src/model/TextGen/prestyle/train.py
@@ -0,0 +1,54 @@
+import torch
+import torch.utils.data as data
+import pytorch_lightning as pl
+from omegaconf import OmegaConf
+import importlib
+from pytorch_lightning.callbacks import ModelCheckpoint
+
+def get_obj_from_str(string, reload=False, invalidate_cache=True):
+ module, cls = string.rsplit(".", 1)
+ if invalidate_cache:
+ importlib.invalidate_caches()
+ if reload:
+ module_imp = importlib.import_module(module)
+ importlib.reload(module_imp)
+ return getattr(importlib.import_module(module, package=None), cls)
+
+def instantiate_from_config(config):
+    if "target" not in config:
+ raise KeyError("Expected key `target` to instantiate.")
+ return get_obj_from_str(config["target"])(**config.get("params", dict()))
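+# Illustrative config layout for instantiate_from_config (the actual keys live
+# in configs/StyleTrain.yaml; the target/params values below are assumptions):
+#
+#   model:
+#     target: trainer.StyleNet
+#     params:
+#       image_size: 128
+#       trainable: true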
+
+def create_data(config):
+ data_cls = get_obj_from_str(config.target)
+ data = data_cls(data_config=config)
+ return data
+
+
+def get_model(cfgs):
+ model = instantiate_from_config(cfgs.model)
+ if "load_ckpt_path" in cfgs:
+ model.load_state_dict(torch.load(cfgs.load_ckpt_path, map_location="cpu")["state_dict"], strict=False)
+ return model
+
+
+def train(cfgs):
+ # create model
+ model = get_model(cfgs)
+
+ # create data
+    data_config = cfgs.pop("data", OmegaConf.create())
+    data = create_data(data_config)
+ data.prepare_data()
+ data.setup(stage='fit')
+
+ checkpoint_callback = ModelCheckpoint(every_n_epochs=cfgs.check_freq, save_top_k=-1)
+ imagelogger_callback = instantiate_from_config(cfgs.imagelogger_callback)
+ trainer = pl.Trainer(callbacks=[checkpoint_callback, imagelogger_callback], **cfgs.lightning)
+ trainer.fit(model=model, datamodule=data)
+
+if __name__ == "__main__":
+ config_path = 'configs/StyleTrain.yaml'
+ cfgs = OmegaConf.load(config_path)
+ train(cfgs)
\ No newline at end of file
diff --git a/src/model/TextGen/prestyle/trainer.py b/src/model/TextGen/prestyle/trainer.py
new file mode 100644
index 0000000..7d8fea5
--- /dev/null
+++ b/src/model/TextGen/prestyle/trainer.py
@@ -0,0 +1,351 @@
+import pytorch_lightning as pl
+import torch
+import torch.nn as nn
+from model import StyleEncoder, SpatialHead, GlyphHead
+import torchvision
+from pytorch_lightning import Callback
+from omegaconf import OmegaConf
+
+
+def count_params(model):
+ total_params = sum(p.numel() for p in model.parameters())
+ print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")
+
+
+
+
+class StyleNet(pl.LightningModule):
+
+ def __init__(self, image_size=128, patch_size=16, in_chans=3, num_classes=0, embed_dim=768, trainable=False,
+ lr=1e-4, alpha_removal=1.0, alpha_seg=1.0, alpha_color=1.0,
+ alpha_font=1.0, enable_spatial_head=True, enable_glyph_head=True, *args, **kwargs):
+ super().__init__()
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.in_chans = in_chans
+ self.num_classes = num_classes
+ self.embed_dim = embed_dim
+ self.enable_spatial_head = enable_spatial_head
+ self.enable_glyph_head = enable_glyph_head
+
+ self.encoder = StyleEncoder(image_size=self.image_size, patch_size=self.patch_size, in_chans=self.in_chans,
+ embed_dim=self.embed_dim)
+ self.spatial_head = SpatialHead(image_size=self.image_size, patch_size=self.patch_size, in_chans=self.in_chans,
+ embed_dim=self.embed_dim)
+ self.glyph_head = GlyphHead(image_size=self.image_size, patch_size=self.patch_size, in_chans=self.in_chans,
+ embed_dim=self.embed_dim)
+
+ if trainable:
+ self.learning_rate = lr
+ self.alpha_removal = alpha_removal
+ self.alpha_seg = alpha_seg
+ self.alpha_color = alpha_color
+ self.alpha_font = alpha_font
+
+ self.count_params()
+
+
+ def count_params(self):
+ count_params(self.encoder)
+
+ count_params(self.spatial_head)
+
+ count_params(self.glyph_head)
+
+
+ def configure_optimizers(self):
+ lr = self.learning_rate
+ opt = torch.optim.AdamW(filter(lambda p: p.requires_grad, self.parameters()), lr=lr)
+ return opt
+
+    def dice_loss(self, predictive, target, ep=1e-8):
+ b, c, h, w = predictive.size()
+ predictive = predictive.view(b, -1)
+ target = target.view(b, -1)
+ intersection = 2 * torch.sum(predictive * target) + ep
+ union = torch.sum(predictive) + torch.sum(target) + ep
+ loss = 1 - intersection / union
+ return loss
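+    # Soft Dice over flattened maps: loss = 1 - (2*sum(P*T) + eps) / (sum(P) + sum(T) + eps);
+    # eps keeps the ratio defined when both masks are empty.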
+
+ def freeze(self):
+ for param in self.parameters():
+ param.requires_grad = False
+
+ def get_loss_full_supervised(self, batch, out_removal, out_seg, out_color, out_font):
+ if self.enable_spatial_head:
+ gt_removal = batch["bg"]
+ gt_seg = batch["seg"]
+
+ # 1.Text removal loss
+ lambda_inmask = 2.
+ lambda_outmask = 0.5
+ loss_removal = lambda_inmask * (nn.functional.l1_loss(out_removal * gt_seg, gt_removal * gt_seg)) + \
+ lambda_outmask * (nn.functional.l1_loss(out_removal * (1 - gt_seg), gt_removal * (1 - gt_seg)))
+
+ # 2.Text segmentation loss
+ loss_seg = self.dice_loss(out_seg, gt_seg)
+ else:
+ loss_removal = 0.
+ loss_seg = 0.
+
+
+ if self.enable_glyph_head:
+ gt_color = batch["c_r"]
+ gt_font = batch["f_r"]
+ # 3. Colorized loss
+ loss_color = nn.functional.mse_loss(out_color, gt_color)
+
+ # 4. Fontilized loss
+ loss_font = self.dice_loss(out_font, gt_font)
+
+ else:
+ loss_color = 0.
+ loss_font = 0.
+
+ return loss_removal, loss_seg, loss_color, loss_font
+
+ def get_loss_self_supervised(self, batch, out_removal, out_seg, out_color, out_font):
+ pass
+
+
+ def training_step(self, batch, batch_idx):
+ input = batch["i_s"]
+
+ img_spatial, img_glyph = self(input)
+ out_removal, out_seg = self.spatial_head(img_spatial)
+ out_color, out_font = self.glyph_head(img_glyph, batch["c_t"], batch["f_t"])
+
+ loss_removal, loss_seg, loss_color, loss_font = self.get_loss_full_supervised(batch, out_removal, out_seg,
+ out_color, out_font)
+ loss = self.alpha_removal * loss_removal + self.alpha_seg * loss_seg + \
+ self.alpha_color * loss_color + self.alpha_font * loss_font
+
+ loss_dict = {}
+ loss_dict["train_loss/loss_removal"] = loss_removal
+ loss_dict["train_loss/loss_seg"] = loss_seg
+ loss_dict["train_loss/loss_color"] = loss_color
+ loss_dict["train_loss/loss_font"] = loss_font
+ loss_dict["train_loss/full_loss"] = loss
+
+ self.log_dict(loss_dict, prog_bar=True, batch_size=len(batch["text_s"]),
+ logger=True, on_step=True, on_epoch=True, sync_dist=True)
+
+ return loss
+
+ def validation_step(self, batch, batch_idx):
+ input = batch["i_s"]
+
+ img_spatial, img_glyph = self(input)
+ out_removal, out_seg = self.spatial_head(img_spatial)
+ out_color, out_font = self.glyph_head(img_glyph, batch["c_t"], batch["f_t"])
+
+ loss_removal, loss_seg, loss_color, loss_font = self.get_loss_full_supervised(batch, out_removal, out_seg,
+ out_color, out_font)
+ loss = self.alpha_removal * loss_removal + self.alpha_seg * loss_seg + \
+ self.alpha_color * loss_color + self.alpha_font * loss_font
+
+ loss_dict = {}
+ loss_dict["val_loss/loss_removal"] = loss_removal
+ loss_dict["val_loss/loss_seg"] = loss_seg
+ loss_dict["val_loss/loss_color"] = loss_color
+ loss_dict["val_loss/loss_font"] = loss_font
+ loss_dict["val_loss/full_loss"] = loss
+
+ self.log_dict(loss_dict, prog_bar=True, batch_size=len(batch["text_s"]),
+ logger=True, on_step=True, on_epoch=True, sync_dist=True)
+
+ return loss
+
+ def forward(self, img):
+ img_spatial, img_glyph = self.encoder(img)
+ return img_spatial, img_glyph
+
+ @torch.no_grad()
+ def log_images(self, batch):
+ # generate result
+ input = batch["i_s"]
+ img_spatial, img_glyph = self(input)
+ out_removal, out_seg = self.spatial_head(img_spatial)
+ out_color, out_font = self.glyph_head(img_glyph, batch["c_t"], batch["f_t"])
+
+ # save result
+ _, _, h, w = batch["i_s"].shape
+ torch_resize = torchvision.transforms.Resize([h, w])
+ img_results = dict()
+ for i, caption in enumerate(batch["text_s"]):
+ source_image = batch["i_s"][i].cpu()
+ source_image = source_image.clamp(0., 1.)
+ source_image = torch_resize(source_image)
+
+ removal_image = out_removal[i].cpu()
+ removal_image = removal_image.clamp(0., 1.)
+ removal_image = torch_resize(removal_image)
+
+ seg_image = out_seg[i].cpu()
+ seg_image = seg_image.clamp(0., 1.)
+ seg_image = torch_resize(seg_image)
+ seg_image = seg_image.repeat(3, 1, 1)
+
+
+ color_gt = batch["c_r"][i].cpu()
+ color_gt = color_gt.clamp(0., 1.)
+ color_gt = torch_resize(color_gt)
+
+ color_image = out_color[i].cpu()
+ color_image = color_image.clamp(0., 1.)
+ color_image = torch_resize(color_image)
+
+
+ font_gt = batch["f_r"][i].cpu()
+ font_gt = font_gt.clamp(0., 1.)
+ font_gt = torch_resize(font_gt)
+
+ font_image = out_font[i].cpu()
+ font_image = font_image.clamp(0., 1.)
+ font_image = torch_resize(font_image)
+
+
+ img_results[f"{i}-{caption}"] = torch.cat(
+ [
+ source_image.unsqueeze(0),
+ removal_image.unsqueeze(0),
+ seg_image.unsqueeze(0),
+ color_gt.unsqueeze(0),
+ color_image.unsqueeze(0),
+ font_gt.unsqueeze(0),
+ font_image.unsqueeze(0),
+ ], dim=0
+ )
+ return img_results
+
+
+
+#########################################################################
+# Image logger
+#########################################################################
+from pytorch_lightning.loggers import WandbLogger, TensorBoardLogger
+from torchvision.utils import make_grid
+from pytorch_lightning.utilities import rank_zero_only
+import os
+from torchvision.transforms import ToPILImage
+
+class StyleNetImageLogger(Callback):
+ def __init__(
+ self,
+ train_batch_frequency,
+ valid_batch_frequency,
+ generation_kwargs,
+ metric_callback=None,
+ disable_wandb=False,
+ ):
+ super().__init__()
+ self.batch_freq = {
+ "train": train_batch_frequency,
+ "valid": valid_batch_frequency,
+ }
+ self.generation_kwargs = OmegaConf.to_container(generation_kwargs)
+ self.metric_callback = metric_callback
+ self.disable_wandb = disable_wandb
+
+ def get_log_dir(self, pl_module):
+ if isinstance(pl_module.logger, WandbLogger):
+ return pl_module.logger.experiment.dir
+ elif isinstance(pl_module.logger, TensorBoardLogger):
+ return pl_module.logger.log_dir
+
+ def logger_log_image(self, pl_module, captions, images, global_step, split):
+ if isinstance(pl_module.logger, WandbLogger) and not self.disable_wandb:
+ pl_module.logger.log_image(
+ key=f"{split}_img/{global_step}",
+ images=images,
+ caption=captions,
+ step=global_step,
+ )
+ elif isinstance(pl_module.logger, TensorBoardLogger):
+ big_grid = make_grid(
+ torch.stack(images),
+ padding=3,
+ pad_value=50,
+ )
+ pl_module.logger.experiment.add_image(
+ f"{split}_img",
+ big_grid,
+ global_step=global_step,
+ )
+ pl_module.logger.experiment.add_text(
+ f"{split}_img_caption",
+ " | ".join(captions),
+ global_step=global_step,
+ )
+
+
+ @rank_zero_only
+ def save_image(self, pl_module, images, global_step, split):
+ print(f"Log images at: {split}/{global_step}")
+ all_image_grids = []
+ all_captions = []
+ for k in images:
+ grid = make_grid(images[k])
+ all_captions.append(f"{k}")
+ all_image_grids.append(grid)
+
+ path = os.path.join(
+ self.get_log_dir(pl_module), split + "_img", str(global_step)
+ )
+ os.makedirs(path, exist_ok=True)
+ for caption, grid in zip(all_captions, all_image_grids):
+ img = ToPILImage()(grid)
+ img.save(os.path.join(path, caption + ".png"))
+
+ self.logger_log_image(
+ pl_module, all_captions,
+ all_image_grids, global_step, split
+ )
+
+ def log_img(self, pl_module, batch, batch_idx, split="train"):
+ if (
+ self.check_frequency(batch_idx, split)
+ and hasattr(pl_module, "log_images")
+ and callable(pl_module.log_images)
+ ):
+ is_train = pl_module.training
+ if is_train:
+ pl_module.eval()
+
+ with torch.no_grad():
+ generation_samples = pl_module.log_images(batch)
+
+ image_results = generation_samples
+ self.save_image(pl_module, image_results,
+ pl_module.global_step, split)
+ if is_train:
+ pl_module.train()
+
+ return generation_samples
+
+ def check_frequency(self, batch_idx, split="train"):
+ if split == "train":
+ if ((batch_idx + 1) % self.batch_freq[split]) == 0: # avoid batch 0
+ return True
+ else:
+ if (batch_idx % self.batch_freq[split.split("_")[0]]) == 0:
+ return True
+ return False
+
+ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
+ self.generation_kwargs["generator"] = torch.Generator(
+ device=pl_module.device
+ ).manual_seed(self.generation_kwargs["seed"])
+ self.log_img(pl_module, batch, batch_idx, split="train")
+
+ def on_validation_batch_end(
+        self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx
+ ):
+ if trainer.state.stage == "sanity_check":
+ return
+ self.generation_kwargs["generator"] = torch.Generator(
+ device=pl_module.device
+ ).manual_seed(self.generation_kwargs["seed"])
+ self.log_img(pl_module, batch, batch_idx, split="valid")
\ No newline at end of file
diff --git a/src/model/TextGen/src/MuSA/GaMuSA.py b/src/model/TextGen/src/MuSA/GaMuSA.py
new file mode 100644
index 0000000..bee982c
--- /dev/null
+++ b/src/model/TextGen/src/MuSA/GaMuSA.py
@@ -0,0 +1,268 @@
+import torch
+import numpy as np
+import torch.nn as nn
+from tqdm import tqdm
+from src.module.abinet.modules.model_vision import BaseVision
+import torchvision.transforms as T
+from omegaconf import OmegaConf
+from src.module.abinet import CharsetMapper
+
+
+class GaMuSA:
+ def __init__(self, model, monitor_cfg):
+ self.model = model
+ self.scheduler = model.noise_scheduler
+ self.device = model.device
+ self.unet = model.unet
+ self.vae = model.vae
+ self.control_model = model.control_model
+ self.text_encoder = model.text_encoder
+ self.NORMALIZER = model.NORMALIZER
+ monitor_cfg = OmegaConf.create(monitor_cfg)
+ self.monitor = BaseVision(monitor_cfg).to(self.device)
+ self.charset = CharsetMapper(filename=monitor_cfg.charset_path)
+ self.max_length = monitor_cfg.max_length + 1
+
+ def next_step(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: int,
+ x: torch.FloatTensor,
+ eta=0.,
+ verbose=False
+ ):
+ """
+ Inverse sampling for DDIM Inversion
+ """
+ if verbose:
+ print("timestep: ", timestep)
+ next_step = timestep
+ timestep = min(timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps, 999)
+ alpha_prod_t = self.scheduler.alphas_cumprod[timestep] if timestep >= 0 else self.scheduler.final_alpha_cumprod
+ alpha_prod_t_next = self.scheduler.alphas_cumprod[next_step]
+ beta_prod_t = 1 - alpha_prod_t
+ pred_x0 = (x - beta_prod_t ** 0.5 * model_output) / alpha_prod_t ** 0.5
+ pred_dir = (1 - alpha_prod_t_next) ** 0.5 * model_output
+ x_next = alpha_prod_t_next ** 0.5 * pred_x0 + pred_dir
+ return x_next, pred_x0
+
+ def step(
+ self,
+ model_output: torch.FloatTensor,
+ timestep: int,
+ x: torch.FloatTensor,
+ eta: float=0.0,
+ verbose=False,
+ ):
+ """
+ predict the next step in the denoise process.
+ """
+ prev_timestep = timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
+ alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
+ alpha_prod_t_prev = self.scheduler.alphas_cumprod[prev_timestep] if prev_timestep > 0 else self.scheduler.final_alpha_cumprod
+ beta_prod_t = 1 - alpha_prod_t
+ pred_x0 = (x - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
+ pred_dir = (1 - alpha_prod_t_prev)**0.5 * model_output
+ x_prev = alpha_prod_t_prev**0.5 * pred_x0 + pred_dir
+ return x_prev, pred_x0
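+    # Both updates are the eta=0 DDIM rule:
+    #   pred_x0  = (x_t - sqrt(1 - a_t) * eps) / sqrt(a_t)
+    #   x_target = sqrt(a_target) * pred_x0 + sqrt(1 - a_target) * eps
+    # step() walks toward t=0, next_step() runs the same rule in reverse to
+    # invert a real image into its noise map.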
+
+ @torch.no_grad()
+ def image2latent(self, image):
+        # the decorator above already disables grad
+        assert image.dim() == 4, "input dims should be 4!"
+        latents = self.vae.encode(image.to(self.device)).latent_dist.sample()
+        latents = latents * self.NORMALIZER
+        return latents
+
+ @torch.no_grad()
+ def latent2image(self, latents, return_type='np'):
+ latents = 1 / self.NORMALIZER * latents.detach()
+ image = self.model.vae.decode(latents).sample
+ if return_type == 'np':
+ image = image.clamp(0, 1)
+ image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
+ image = (image * 255).astype(np.uint8)
+ return image
+
+ def latent2image_grad(self, latents):
+ latents = 1 / self.NORMALIZER * latents
+ image = self.vae.decode(latents).sample
+ return image
+
+ @torch.no_grad()
+ def inversion(
+ self,
+ image: torch.Tensor,
+ hint: torch.Tensor,
+ cond,
+ num_inference_steps=50,
+ guidance_scale=7.5,
+ eta=0.0,
+ return_intermediates=False,
+ **kwds):
+ """
+        Invert a real image into a noise map with deterministic DDIM inversion.
+ """
+
+        assert image.shape[0] == len(cond), "Unequal batch size for image and cond."
+        assert image.shape[0] == hint.shape[0], "Unequal batch size for image and hint."
+
+ cond_embeddings = self.model.get_text_conditioning(cond)
+ if guidance_scale > 1.:
+ uncond = [""] * len(cond)
+ uncond_embeddings = self.model.get_text_conditioning(uncond)
+ text_embeddings = torch.cat([uncond_embeddings, cond_embeddings], dim=0)
+ else:
+ text_embeddings = cond_embeddings
+
+ latents = self.image2latent(image)
+ start_latents = latents
+
+
+ self.scheduler.set_timesteps(num_inference_steps)
+ latents_list = [latents]
+ pred_x0_list = [latents]
+ for i, t in enumerate(reversed(self.scheduler.timesteps)):
+ if guidance_scale > 1.:
+ model_inputs = torch.cat([latents] * 2)
+ hint_input = torch.cat([hint] * 2)
+ else:
+ model_inputs = latents
+ hint_input = hint
+
+ control_input = self.control_model(hint_input, model_inputs, t, text_embeddings)
+
+ noise_pred = self.unet(
+ x=model_inputs,
+ timestep=t,
+ encoder_hidden_states=text_embeddings,
+ control=control_input
+ ).sample
+ if guidance_scale > 1.:
+ noise_pred_uncon, noise_pred_con = noise_pred.chunk(2, dim=0)
+ noise_pred = noise_pred_uncon + guidance_scale * (noise_pred_con - noise_pred_uncon)
+ latents, pred_x0 = self.next_step(noise_pred, t, latents)
+ latents_list.append(latents)
+ pred_x0_list.append(pred_x0)
+ if return_intermediates:
+ return latents, latents_list
+ return latents, start_latents
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ hint,
+ cond,
+ start_step=24,
+ start_layer=10,
+ batch_size=1,
+ height=256,
+ width=256,
+ num_inference_steps=50,
+ guidance_scale=2,
+ eta=0.0,
+ latents=None,
+ unconditioning=None,
+ neg_prompt=None,
+ ref_intermediate_latents=None,
+ return_intermediates=False,
+ enable_GaMuSA=True,
+ **kwds):
+
+ cond_embeddings = self.model.get_text_conditioning(cond)
+ if guidance_scale > 1.:
+ uncond = [""] * len(cond)
+ uncond_embeddings = self.model.get_text_conditioning(uncond)
+ text_embeddings = torch.cat([uncond_embeddings, cond_embeddings], dim=0)
+ else:
+ text_embeddings = cond_embeddings
+
+ batch_size = len(cond)
+ latents_shape = (batch_size, self.unet.in_channels, height // 8, width // 8)
+ if latents is None:
+ latents = torch.randn(latents_shape, device=self.device)
+ else:
+ pass
+
+ self.scheduler.set_timesteps(num_inference_steps)
+ latents_list = [latents]
+ pred_x0_list = [latents]
+
+ gt_ids, _ = prepare_label(cond, self.charset, self.max_length, self.device)
+        from src.MuSA.utils import MuSA_TextCtrl, register_attention_editor_diffusers_Edit
+ if enable_GaMuSA:
+ controller = MuSA_TextCtrl(start_step, start_layer)
+            register_attention_editor_diffusers_Edit(self.unet, controller)
+ controller.start_ctrl()
+
+ for i, t in enumerate(self.scheduler.timesteps):
+ if ref_intermediate_latents is not None:
+ latents_ref = ref_intermediate_latents[-1 - i]
+ _, latents_cur = latents.chunk(2)
+ latents = torch.cat([latents_ref, latents_cur])
+
+ if guidance_scale > 1.:
+ model_inputs = torch.cat([latents] * 2)
+ hint_input = torch.cat([hint] * 2)
+ else:
+ model_inputs = latents
+ hint_input = hint
+
+ control_input = self.control_model(hint_input, model_inputs, t, text_embeddings)
+
+ if unconditioning is not None and isinstance(unconditioning, list):
+ _, text_embeddings = text_embeddings.chunk(2)
+ text_embeddings = torch.cat([unconditioning[i].expand(*text_embeddings.shape), text_embeddings])
+ noise_pred = self.unet(
+ x=model_inputs,
+ timestep=t,
+ encoder_hidden_states=text_embeddings,
+ control=control_input,
+ ).sample
+ if guidance_scale > 1.:
+ noise_pred_uncon, noise_pred_con = noise_pred.chunk(2, dim=0)
+ noise_pred = noise_pred_uncon + guidance_scale * (noise_pred_con - noise_pred_uncon)
+ latents, pred_x0 = self.step(noise_pred, t, latents)
+
+ if enable_GaMuSA:
+                glyph_resize = T.Resize([32, 128])
+ if (i + 1) % 5 == 0:
+ glyph_inputs = self.latent2image(latents, return_type="pt")
+ glyph_inputs = glyph_resize(glyph_inputs)
+ outputs = self.monitor(glyph_inputs)
+ cosine_score = glyph_cosine_similarity(outputs, gt_ids)
+ controller.reset_alpha(cosine_score)
+ if enable_GaMuSA:
+ controller.reset_ctrl()
+ controller.reset()
+ image = self.latent2image(latents, return_type="pt")
+ if return_intermediates:
+ pred_x0_list = [self.latent2image(img, return_type="pt") for img in pred_x0_list]
+ latents_list = [self.latent2image(img, return_type="pt") for img in latents_list]
+ return image, latents_list, pred_x0_list
+ return image
+
+
+
+def glyph_cosine_similarity(output, gt_labels):
+ pt_logits = nn.Softmax(dim=2)(output['logits'])
+ assert pt_logits.shape[0] == gt_labels.shape[0]
+ assert pt_logits.shape[2] == gt_labels.shape[2]
+ score = nn.CosineSimilarity(dim=2, eps=1e-6)(pt_logits, gt_labels)
+ mean_score = torch.mean(score, dim=1)
+ return mean_score
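+# Mean per-position cosine similarity between the OCR softmax distributions and
+# the one-hot ground-truth labels; GaMuSA feeds this score to reset_alpha() as
+# the fusion weight between source and target attention branches.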
+
+def prepare_label(labels, charset, max_length, device):
+ gt_ids = []
+ gt_lengths = []
+ for label in labels:
+ length = torch.tensor(max_length, dtype=torch.long)
+ label = charset.get_labels(label, length=max_length, padding=True, case_sensitive=False)
+ label = torch.tensor(label, dtype=torch.long)
+ label = CharsetMapper.onehot(label, charset.num_classes)
+ gt_ids.append(label)
+ gt_lengths.append(length)
+ gt_ids = torch.stack(gt_ids).to(device)
+ gt_lengths = torch.stack(gt_lengths).to(device)
+ return gt_ids, gt_lengths
\ No newline at end of file
diff --git a/src/model/TextGen/src/MuSA/GaMuSA_app.py b/src/model/TextGen/src/MuSA/GaMuSA_app.py
new file mode 100644
index 0000000..a4ddcfc
--- /dev/null
+++ b/src/model/TextGen/src/MuSA/GaMuSA_app.py
@@ -0,0 +1,31 @@
+import torch
+
+def text_editing(model, source_image, style_image, style_text, target_text,
+ starting_layer=10, ddim_steps=50, scale=2):
+
+ device = model.device
+ with torch.no_grad():
+ prompts = [style_text, target_text]
+ inversion_prompt = [style_text]
+ start_code, latents_list = model.inversion(source_image,
+ style_image,
+ inversion_prompt,
+ guidance_scale=scale,
+ num_inference_steps=ddim_steps,
+ return_intermediates=True)
+ start_code = start_code.expand(len(prompts), -1, -1, -1)
+ image_GaMuSA = model(
+ style_image.expand(len(prompts), -1, -1, -1),
+ prompts,
+ num_inference_steps=ddim_steps,
+ latents=start_code,
+ guidance_scale=scale,
+ enable_GaMuSA=True
+ )
+ image_GaMuSA = image_GaMuSA.clamp(0, 1)
+ image_GaMuSA = image_GaMuSA.cpu().permute(0, 2, 3, 1).numpy()
+
+ return [
+ image_GaMuSA[0],
+ image_GaMuSA[1],
+ ]
diff --git a/src/model/TextGen/src/MuSA/utils.py b/src/model/TextGen/src/MuSA/utils.py
new file mode 100644
index 0000000..4c9d157
--- /dev/null
+++ b/src/model/TextGen/src/MuSA/utils.py
@@ -0,0 +1,198 @@
+import os
+import cv2
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import sys
+from typing import Optional, Union, Tuple, List, Callable, Dict
+
+from torchvision.utils import save_image
+from einops import rearrange, repeat
+
+class AttentionBase:
+ def __init__(self):
+ self.cur_step = 0
+ self.num_att_layers = -1
+        self.cur_att_layer = 0
+
+    def after_step(self):
+ pass
+
+ def __call__(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
+ out = self.forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
+ self.cur_att_layer += 1
+ if self.cur_att_layer == self.num_att_layers:
+ self.cur_att_layer = 0
+ self.cur_step += 1
+ self.after_step()
+ return out
+
+ def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
+ out = torch.einsum('b i j, b j d -> b i d', attn, v)
+ out = rearrange(out, '(b h) n d -> b n (h d)', h=num_heads)
+ return out
+
+ def reset(self):
+ self.cur_step = 0
+ self.cur_att_layer = 0
+
+def register_attention_editor_diffusers_Edit(model, editor: AttentionBase):
+ def ca_forward(self, place_in_unet):
+ def forward(x, encoder_hidden_states=None, attention_mask=None, context=None, mask=None):
+ if encoder_hidden_states is not None:
+ context = encoder_hidden_states
+ if attention_mask is not None:
+ mask = attention_mask
+
+            to_out = self.to_out
+            if isinstance(to_out, nn.ModuleList):
+                to_out = to_out[0]
+
+ h = self.heads
+ q = self.to_q(x)
+ is_cross = context is not None
+ context = context if is_cross else x
+ k = self.to_k(context)
+ v = self.to_v(context)
+ q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
+
+ sim = torch.einsum('b i d, b j d -> b i j', q, k) * self.scale
+
+            if mask is not None:
+                mask = rearrange(mask, 'b ... -> b (...)')
+                max_neg_value = -torch.finfo(sim.dtype).max
+                # repeat already adds the head and query dims; a second expansion
+                # would make the mask 4-d and crash masked_fill_.
+                mask = repeat(mask, 'b j -> (b h) () j', h=h)
+                sim.masked_fill_(~mask, max_neg_value)
+
+ attn = sim.softmax(dim=-1)
+ out = editor(
+ q, k, v, sim, attn, is_cross, place_in_unet,
+ self.heads, scale=self.scale)
+
+ return to_out(out)
+
+ return forward
+
+ def register_editor(net, count, place_in_unet):
+ for name, subnet in net.named_children():
+ if net.__class__.__name__ == 'CrossAttention':
+ net.forward = ca_forward(net, place_in_unet)
+ return count + 1
+ elif hasattr(net, 'children'):
+ count = register_editor(subnet, count, place_in_unet)
+ return count
+
+ cross_att_count = 0
+ for net_name, net in model.named_children():
+ if "down" in net_name:
+ cross_att_count += register_editor(net, 0, "down")
+ elif "mid" in net_name:
+ cross_att_count += register_editor(net, 0, "mid")
+ elif "up" in net_name:
+ cross_att_count += register_editor(net, 0, "up")
+    editor.num_att_layers = cross_att_count
+
+
+# Backward-compatible alias for the original (misspelled) name, in case other
+# call sites still reference it.
+regiter_attention_editor_diffusers_Edit = register_attention_editor_diffusers_Edit
+
+
+
+class MutualSelfAttentionControl(AttentionBase):
+ MODEL_TYPE = {
+ "SD": 16,
+ "SDXL": 70
+ }
+
+ def __init__(self, start_step=4, start_layer=10, layer_idx=None, step_idx=None, total_steps=50, model_type="SD"):
+ super().__init__()
+ self.total_steps = total_steps
+ self.total_layers = self.MODEL_TYPE.get(model_type, 16)
+ self.start_step = start_step
+ self.start_layer = start_layer
+ self.layer_idx = layer_idx if layer_idx is not None else list(range(start_layer, self.total_layers))
+ self.step_idx = step_idx if step_idx is not None else list(range(start_step, total_steps))
+
+ def attn_batch(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
+ b = q.shape[0] // num_heads
+ q = rearrange(q, "(b h) n d -> h (b n) d", h=num_heads)
+ k = rearrange(k, "(b h) n d -> h (b n) d", h=num_heads)
+ v = rearrange(v, "(b h) n d -> h (b n) d", h=num_heads)
+
+ sim = torch.einsum("h i d, h j d -> h i j", q, k) * kwargs.get("scale")
+ attn = sim.softmax(-1)
+ out = torch.einsum("h i j, h j d -> h i d", attn, v)
+ out = rearrange(out, "h (b n) d -> b n (h d)", b=b)
+ return out
+
+ def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
+ if is_cross or self.cur_step not in self.step_idx or self.cur_att_layer // 2 not in self.layer_idx:
+ return super().forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
+ qu, qc = q.chunk(2)
+ ku, kc = k.chunk(2)
+ vu, vc = v.chunk(2)
+ attnu, attnc = attn.chunk(2)
+
+ out_u = self.attn_batch(qu, ku[:num_heads], vu[:num_heads], sim[:num_heads], attnu, is_cross, place_in_unet, num_heads, **kwargs)
+ out_c = self.attn_batch(qc, kc[:num_heads], vc[:num_heads], sim[:num_heads], attnc, is_cross, place_in_unet, num_heads, **kwargs)
+ out = torch.cat([out_u, out_c], dim=0)
+
+ return out
+
+
+class MuSA_TextCtrl(MutualSelfAttentionControl):
+ def __init__(self, start_step=4, start_layer=10, layer_idx=None, step_idx=None, total_steps=50, model_type="SD"):
+ super().__init__(start_step, start_layer, layer_idx, step_idx, total_steps, model_type)
+ self.alpha_flag = False
+ self.alpha = None
+        self.no_ctrl = True
+
+ def reset_alpha(self, cosine_score):
+ self.alpha_flag = True
+ _, self.alpha = cosine_score.chunk(2)
+ self.alpha = self.alpha.unsqueeze(1).unsqueeze(1)
+
+ def reset_ctrl(self):
+ self.no_ctrl = True
+
+ def start_ctrl(self):
+ self.no_ctrl = False
+
+    def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
+        if self.no_ctrl or is_cross or self.cur_step not in self.step_idx or self.cur_att_layer // 2 not in self.layer_idx:
+            # force the parent class down its plain (non-mutual) attention path
+            is_cross = True
+            return super().forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
+
+ qu_s, qu_t, qc_s, qc_t = q.chunk(4)
+ ku_s, ku_t, kc_s, kc_t = k.chunk(4)
+ vu_s, vu_t, vc_s, vc_t = v.chunk(4)
+ attnu_s, attnu_t, attnc_s, attnc_t = attn.chunk(4)
+
+ out_u_s = self.attn_batch(qu_s, ku_s, vu_s, sim, attnu_s, is_cross, place_in_unet, num_heads, **kwargs)
+ out_c_s = self.attn_batch(qc_s, kc_s, vc_s, sim, attnc_s, is_cross, place_in_unet, num_heads, **kwargs)
+
+ if self.alpha_flag:
+ B, N, D = ku_s.shape
+ score = self.alpha.detach().clone()
+ score = score.repeat(1, N, D * num_heads)
+ score = rearrange(score, 'b n (h d) -> (b h) n d', h=num_heads)
+ ku_t_fusion = score * ku_s + (1 - score) * ku_t
+ vu_t_fusion = score * vu_s + (1 - score) * vu_t
+ kc_t_fusion = score * kc_s + (1 - score) * kc_t
+ vc_t_fusion = score * vc_s + (1 - score) * vc_t
+ else:
+ ku_t_fusion = ku_t
+ vu_t_fusion = vu_t
+ kc_t_fusion = kc_t
+ vc_t_fusion = vc_t
+
+ out_u_t = self.attn_batch(qu_t, ku_t_fusion, vu_t_fusion, sim[:num_heads], attnu_t,
+ is_cross, place_in_unet, num_heads, **kwargs)
+ out_c_t = self.attn_batch(qc_t, kc_t_fusion, vc_t_fusion, sim[:num_heads], attnc_t,
+ is_cross, place_in_unet, num_heads, **kwargs)
+
+ out = torch.cat([out_u_s, out_u_t, out_c_s, out_c_t], dim=0)
+
+ return out
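+
+    # Fusion sketch: with per-sample weight a in [0, 1] set via reset_alpha(), the target
+    # branch attends over blended keys/values, e.g. k_t' = a * k_s + (1 - a) * k_t, so
+    # a -> 1 copies the source appearance and a -> 0 leaves the target branch untouched.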
+
+
+
diff --git a/src/model/TextGen/src/dataset/textdata.py b/src/model/TextGen/src/dataset/textdata.py
new file mode 100644
index 0000000..a35f5cf
--- /dev/null
+++ b/src/model/TextGen/src/dataset/textdata.py
@@ -0,0 +1,147 @@
+import json
+import random
+
+import cv2
+import numpy as np
+import os
+from torch.utils.data import Dataset
+from torchvision import transforms
+from PIL import Image
+import pytorch_lightning as pl
+from torch.utils.data import DataLoader
+import torchvision.transforms as T
+
+class WrappedDataModule(pl.LightningDataModule):
+ def __init__(self, data_config, **kwargs):
+ super().__init__()
+ self.save_hyperparameters()
+ self.config = data_config
+ self.batch_size = data_config.batch_size
+
+ def setup(self, stage: str):
+ if stage == "fit":
+ self.train = TextDataset(self.config.train)
+ self.val = TextDataset(self.config.validation)
+ if stage == "test" or stage == "predict":
+ self.val = TextDataset(self.config.test)
+
+ def train_dataloader(self):
+ return DataLoader(
+ self.train,
+ batch_size=self.batch_size,
+ shuffle=True,
+ num_workers=8,
+ )
+
+ def val_dataloader(self):
+ return DataLoader(
+ self.val,
+ batch_size=self.batch_size,
+ shuffle=False,
+ num_workers=8,
+ )
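+
+    # Expected shape of data_config (assumed, mirroring the fields read above):
+    #   batch_size: 64
+    #   train:      {root_dir: ..., font_dir: ..., size: 256}
+    #   validation: {root_dir: ..., font_dir: ..., size: 256}
+    #   test:       {root_dir: ..., font_dir: ..., size: 256}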
+
+
+class TextDataset(Dataset):
+ def __init__(self, config):
+ super().__init__()
+ self.size = config.size
+ self.data_dir_list = []
+ for dir_name in os.listdir(config.root_dir):
+ self.data_dir_list.append(os.path.join(config.root_dir, dir_name))
+ self.font_paths = [os.path.join(config.font_dir, font_name) for font_name in os.listdir(config.font_dir)]
+
+ # source image
+ self.i_s = []
+ # target image
+ self.t_f = []
+
+ self.text_s = []
+ self.text_t = []
+
+ for data_dir in self.data_dir_list:
+ tmp_names = []
+ with open(os.path.join(data_dir, 'i_s.txt'), 'r') as f_s:
+ lines = f_s.readlines()
+                for line in lines:
+                    # split on the first space only, so labels may contain spaces
+                    name, text = line.strip().split(' ', 1)
+                    self.text_s.append(text)
+                    tmp_names.append(name)
+ with open(os.path.join(data_dir, 'i_t.txt'), 'r') as f_t:
+ lines = f_t.readlines()
+                for line in lines:
+                    _, text = line.strip().split(' ', 1)
+                    self.text_t.append(text)
+
+ for tmp_name in tmp_names:
+ self.i_s.append(os.path.join(data_dir, 'i_s', tmp_name))
+ self.t_f.append(os.path.join(data_dir, 't_f', tmp_name))
+
+ self.transform = T.Compose([
+ T.Resize((self.size, self.size)),
+ T.ToTensor(),
+ ])
+        print(f"Collected {len(self.i_s)} samples in total")
+
+
+ def __len__(self):
+ return len(self.i_s)
+
+ def __getitem__(self, index):
+ img_i_s = Image.open(self.i_s[index]).convert("RGB")
+ i_s = self.transform(img_i_s)
+ img_t_f = Image.open(self.t_f[index]).convert("RGB")
+ t_f = self.transform(img_t_f)
+ text_s = self.text_s[index].strip()
+ text_t = self.text_t[index].strip()
+
+        # drop the text condition 10% of the time (classifier-free-guidance-style training)
+        if random.random() < 0.1:
+            cond = ""
+        else:
+            cond = text_t
+
+ return dict(img=t_f, texts=text_t, cond=cond, hint=i_s)
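+
+    # Label-file sketch: i_s.txt and i_t.txt are assumed to hold one "filename text"
+    # pair per line, e.g. "0001.png Sunset", with the image present under both the
+    # i_s/ and t_f/ subdirectories of the same data directory.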
+
+
+class InferDataset(Dataset):
+ def __init__(self, data_dir, size=256):
+ super().__init__()
+ self.size = size
+ self.i_s = []
+ self.text_s = []
+ self.text_t = []
+
+ tmp_names = []
+ with open(os.path.join(data_dir, 'i_s.txt'), 'r') as f_s:
+ lines = f_s.readlines()
+            for line in lines:
+                # split on the first space only, so labels may contain spaces
+                name, text = line.strip().split(' ', 1)
+                self.text_s.append(text)
+                tmp_names.append(name)
+ with open(os.path.join(data_dir, 'i_t.txt'), 'r') as f_t:
+ lines = f_t.readlines()
+            for line in lines:
+                _, text = line.strip().split(' ', 1)
+                self.text_t.append(text)
+
+ for tmp_name in tmp_names:
+ self.i_s.append(os.path.join(data_dir, 'i_s', tmp_name))
+
+ self.transform = T.Compose([
+ T.Resize((self.size, self.size)),
+ T.ToTensor(),
+ ])
+        print(f"Collected {len(self.i_s)} inference samples in total")
+
+ def __len__(self):
+ return len(self.i_s)
+
+ def __getitem__(self, index):
+        name = os.path.basename(self.i_s[index])
+
+ img_i_s = Image.open(self.i_s[index]).convert("RGB")
+ i_s = self.transform(img_i_s)
+ text_s = self.text_s[index].strip()
+ text_t = self.text_t[index].strip()
+ cond = text_t
+ return dict(img=i_s, texts=text_t, cond=cond, hint=i_s, names=name)
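+
+# Usage sketch (assumed directory containing i_s/, i_s.txt and i_t.txt):
+#   ds = InferDataset("example/", size=256)
+#   loader = DataLoader(ds, batch_size=4, shuffle=False)
+#   batch = next(iter(loader))  # dict with img / texts / cond / hint / names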
diff --git a/src/model/TextGen/src/module/abinet/__init__.py b/src/model/TextGen/src/module/abinet/__init__.py
new file mode 100644
index 0000000..1d61bcf
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/__init__.py
@@ -0,0 +1,4 @@
+from .abinet_base import get_model, preprocess, postprocess, load, create_ocr_model
+from .utils import CharsetMapper, prepare_label
+from .modules.model_abinet_iter import ABINetIterModel, ABINetIterModelWrapper
+from .modules.losses import MultiLosses
diff --git a/src/model/TextGen/src/module/abinet/abinet_base.py b/src/model/TextGen/src/module/abinet/abinet_base.py
new file mode 100644
index 0000000..03eeafc
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/abinet_base.py
@@ -0,0 +1,107 @@
+import logging
+import os
+import glob
+import torch
+import PIL
+import torch.nn.functional as F
+from torchvision import transforms
+from omegaconf import OmegaConf
+from .utils import Config, CharsetMapper
+
+
+BASE_DIR = "src/abinet/"
+DEFAULT_OCR_CONFIG = {
+ "conf": os.path.join(BASE_DIR, "configs/train_abinet.yaml"),
+ "default_conf": os.path.join(BASE_DIR, "configs/template.yaml"),
+ "ckpt": os.path.join(BASE_DIR, "checkpoints/abinet/train-abinet/best-train-abinet.pth"),
+}
+
+
+def create_ocr_model(device=None):
+ print("Loading OCR model...")
+ if device is None:
+ device = torch.cuda.current_device()
+ default_conf = OmegaConf.load(DEFAULT_OCR_CONFIG["default_conf"])
+ conf = OmegaConf.load(DEFAULT_OCR_CONFIG["conf"])
+ config = OmegaConf.merge(default_conf, conf)
+ OmegaConf.resolve(config)
+ charset = CharsetMapper(
+ filename=config.dataset.charset_path, max_length=config.dataset.max_length + 1
+ )
+ config.model_eval = "alignment"
+ ocr_model = get_model(config.model)
+ model = load(
+ ocr_model,
+ DEFAULT_OCR_CONFIG["ckpt"],
+ device=None,
+ strict="Contrast" not in config.model.name,
+ ) # always load to cpu first
+ model = model.to(device)
+ print("OCR Model loaded")
+    return charset, model  # load() mutated ocr_model in place; return the moved module
+
+
+def get_model(config, device="cpu", reload=False):
+ import importlib
+
+ module, cls = config.name.rsplit(".", 1)
+ if reload:
+ module_imp = importlib.import_module(module)
+ importlib.reload(module_imp)
+ cls = getattr(importlib.import_module(module, package=None), cls)
+
+ model = cls(config)
+ logging.info(model)
+ model = model.eval()
+ return model
+
+
+def preprocess(img, width=128, height=32):
+ img = img.resize((width, height), PIL.Image.Resampling.BILINEAR)
+ img = transforms.ToTensor()(img).unsqueeze(0)
+ mean = torch.tensor([0.485, 0.456, 0.406])
+ std = torch.tensor([0.229, 0.224, 0.225])
+ return (img - mean[..., None, None]) / std[..., None, None]
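+
+# e.g. an arbitrary crop becomes a (1, 3, 32, 128) tensor normalized with the usual
+# ImageNet statistics:
+#   x = preprocess(PIL.Image.open("crop.png").convert("RGB"))  # hypothetical file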
+
+def postprocess(output, charset, model_eval):
+ def _get_output(last_output, model_eval):
+ if isinstance(last_output, (tuple, list)):
+ for res in last_output:
+ if res["name"] == model_eval:
+ output = res
+ else:
+ output = last_output
+ return output
+
+ def _decode(logit):
+        """Greedy decode"""
+ out = F.softmax(logit, dim=2)
+ pt_text, pt_scores, pt_lengths = [], [], []
+ for o in out:
+ text = charset.get_text(o.argmax(dim=1), padding=False, trim=False)
+ text = text.split(charset.null_char)[0] # end at end-token
+ pt_text.append(text)
+ pt_scores.append(o.max(dim=1)[0])
+ pt_lengths.append(
+ min(len(text) + 1, charset.max_length)
+ ) # one for end-token
+ return pt_text, pt_scores, pt_lengths
+
+ output = _get_output(output, model_eval)
+ logits, pt_lengths = output["logits"], output["pt_lengths"]
+ pt_text, pt_scores, pt_lengths_ = _decode(logits)
+ return pt_text, pt_scores, pt_lengths_
+
+
+def load(model, file, device=None, strict=True):
+ if device is None:
+ device = "cpu"
+ elif isinstance(device, int):
+ device = torch.device("cuda", device)
+
+ assert os.path.isfile(file)
+ state = torch.load(file, map_location=device)
+ if set(state.keys()) == {"model", "opt"}:
+ state = state["model"]
+ model.load_state_dict(state, strict=strict)
+ return model
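+
+# Usage sketch, assuming the config and checkpoint paths in DEFAULT_OCR_CONFIG exist:
+#   charset, model = create_ocr_model(device=0)
+#   outputs = model(preprocess(img).to(0), mode="eval")  # hypothetical PIL image `img`
+#   texts, scores, lengths = postprocess(outputs, charset, "alignment")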
diff --git a/src/model/TextGen/src/module/abinet/configs/pretrain_language_model.yaml b/src/model/TextGen/src/module/abinet/configs/pretrain_language_model.yaml
new file mode 100644
index 0000000..b8d4a04
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/configs/pretrain_language_model.yaml
@@ -0,0 +1,45 @@
+global:
+ name: pretrain-language-model
+ phase: train
+ stage: pretrain-language
+ workdir: workdir
+ seed: ~
+
+dataset:
+ train: {
+ roots: ['data/WikiText-103.csv'],
+ batch_size: 4096
+ }
+ test: {
+ roots: ['data/WikiText-103_eval_d1.csv'],
+ batch_size: 4096
+ }
+
+training:
+ epochs: 80
+ show_iters: 50
+ eval_iters: 6000
+ save_iters: 3000
+
+optimizer:
+ type: Adam
+ true_wd: False
+ wd: 0.0
+ bn_wd: False
+ clip_grad: 20
+ lr: 0.0001
+ args: {
+ betas: !!python/tuple [0.9, 0.999], # for default Adam
+ }
+ scheduler: {
+ periods: [70, 10],
+ gamma: 0.1,
+ }
+
+model:
+ name: 'abinet.modules.model_language.BCNLanguage'
+ language: {
+ num_layers: 4,
+ loss_weight: 1.,
+ use_self_attn: False
+ }
diff --git a/src/model/TextGen/src/module/abinet/configs/pretrain_vision_model.yaml b/src/model/TextGen/src/module/abinet/configs/pretrain_vision_model.yaml
new file mode 100644
index 0000000..ad0dd4a
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/configs/pretrain_vision_model.yaml
@@ -0,0 +1,58 @@
+global:
+ name: pretrain-vision-model
+ phase: train
+ stage: pretrain-vision
+ workdir: workdir
+ seed: ~
+
+dataset:
+ train: {
+ roots: ['data/training/MJ/MJ_train/',
+ 'data/training/MJ/MJ_test/',
+ 'data/training/MJ/MJ_valid/',
+ 'data/training/ST'],
+ batch_size: 384
+ }
+ test: {
+ roots: ['data/evaluation/IIIT5k_3000',
+ 'data/evaluation/SVT',
+ 'data/evaluation/SVTP',
+ 'data/evaluation/IC13_857',
+ 'data/evaluation/IC15_1811',
+ 'data/evaluation/CUTE80'],
+ batch_size: 384
+ }
+ data_aug: True
+ multiscales: False
+ num_workers: 14
+
+training:
+ epochs: 8
+ show_iters: 50
+ eval_iters: 3000
+ save_iters: 3000
+
+optimizer:
+ type: Adam
+ true_wd: False
+ wd: 0.0
+ bn_wd: False
+ clip_grad: 20
+ lr: 0.0001
+ args: {
+ betas: !!python/tuple [0.9, 0.999], # for default Adam
+ }
+ scheduler: {
+ periods: [6, 2],
+ gamma: 0.1,
+ }
+
+model:
+ name: 'abinet.modules.model_vision.BaseVision'
+ checkpoint: ~
+ vision: {
+ loss_weight: 1.,
+ attention: 'position',
+ backbone: 'transformer',
+ backbone_ln: 3,
+ }
diff --git a/src/model/TextGen/src/module/abinet/configs/pretrain_vision_model_sv.yaml b/src/model/TextGen/src/module/abinet/configs/pretrain_vision_model_sv.yaml
new file mode 100644
index 0000000..daea7b5
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/configs/pretrain_vision_model_sv.yaml
@@ -0,0 +1,58 @@
+global:
+ name: pretrain-vision-model-sv
+ phase: train
+ stage: pretrain-vision
+ workdir: workdir
+ seed: ~
+
+dataset:
+ train: {
+ roots: ['data/training/MJ/MJ_train/',
+ 'data/training/MJ/MJ_test/',
+ 'data/training/MJ/MJ_valid/',
+ 'data/training/ST'],
+ batch_size: 384
+ }
+ test: {
+ roots: ['data/evaluation/IIIT5k_3000',
+ 'data/evaluation/SVT',
+ 'data/evaluation/SVTP',
+ 'data/evaluation/IC13_857',
+ 'data/evaluation/IC15_1811',
+ 'data/evaluation/CUTE80'],
+ batch_size: 384
+ }
+ data_aug: True
+ multiscales: False
+ num_workers: 14
+
+training:
+ epochs: 8
+ show_iters: 50
+ eval_iters: 3000
+ save_iters: 3000
+
+optimizer:
+ type: Adam
+ true_wd: False
+ wd: 0.0
+ bn_wd: False
+ clip_grad: 20
+ lr: 0.0001
+ args: {
+ betas: !!python/tuple [0.9, 0.999], # for default Adam
+ }
+ scheduler: {
+ periods: [6, 2],
+ gamma: 0.1,
+ }
+
+model:
+ name: 'abinet.modules.model_vision.BaseVision'
+ checkpoint: ~
+ vision: {
+ loss_weight: 1.,
+ attention: 'attention',
+ backbone: 'transformer',
+ backbone_ln: 2,
+ }
diff --git a/src/model/TextGen/src/module/abinet/configs/template.yaml b/src/model/TextGen/src/module/abinet/configs/template.yaml
new file mode 100644
index 0000000..45f8287
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/configs/template.yaml
@@ -0,0 +1,67 @@
+global:
+ name: exp
+ phase: train
+ stage: pretrain-vision
+ workdir: /tmp/workdir
+ seed: ~
+
+dataset:
+ train: {
+ roots: ['data/training/MJ/MJ_train/',
+ 'data/training/MJ/MJ_test/',
+ 'data/training/MJ/MJ_valid/',
+ 'data/training/ST'],
+ batch_size: 128
+ }
+ test: {
+ roots: ['data/evaluation/IIIT5k_3000',
+ 'data/evaluation/SVT',
+ 'data/evaluation/SVTP',
+ 'data/evaluation/IC13_857',
+ 'data/evaluation/IC15_1811',
+ 'data/evaluation/CUTE80'],
+ batch_size: 128
+ }
+ charset_path: abinet/data/charset_36.txt
+ num_workers: 4
+ max_length: 25 # 30
+ image_height: 32
+ image_width: 128
+ case_sensitive: False
+ eval_case_sensitive: False
+ data_aug: True
+ multiscales: False
+ pin_memory: True
+ smooth_label: False
+ smooth_factor: 0.1
+ one_hot_y: True
+ use_sm: False
+
+training:
+ epochs: 6
+ show_iters: 50
+ eval_iters: 3000
+ save_iters: 20000
+ start_iters: 0
+ stats_iters: 100000
+
+optimizer:
+ type: Adadelta # Adadelta, Adam
+ true_wd: False
+ wd: 0. # 0.001
+ bn_wd: False
+ args: {
+ # betas: !!python/tuple [0.9, 0.99], # betas=(0.9,0.99) for AdamW
+ # betas: !!python/tuple [0.9, 0.999], # for default Adam
+ }
+ clip_grad: 20
+ lr: [1.0, 1.0, 1.0] # lr: [0.005, 0.005, 0.005]
+ scheduler: {
+ periods: [3, 2, 1],
+ gamma: 0.1,
+ }
+
+model:
+ name: 'abinet.modules.model_abinet.ABINetModel'
+ checkpoint: ~
+ strict: True
diff --git a/src/model/TextGen/src/module/abinet/configs/train_abinet.yaml b/src/model/TextGen/src/module/abinet/configs/train_abinet.yaml
new file mode 100644
index 0000000..a614306
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/configs/train_abinet.yaml
@@ -0,0 +1,35 @@
+global:
+ name: train-abinet
+ phase: train
+ stage: train-super
+ workdir: workdir
+ seed: ~
+
+model:
+ name: 'abinet.modules.model_abinet_iter.ABINetIterModel'
+ max_length: ${dataset.max_length}
+ charset_path: ${dataset.charset_path}
+ iter_size: 3
+ ensemble: ''
+ use_vision: False
+ vision:
+ checkpoint: checkpoints/abinet/pretrain-vision-model/best-pretrain-vision-model.pth
+ loss_weight: 1.
+ attention: 'position'
+ backbone: 'transformer'
+ backbone_ln: 3
+ max_length: ${model.max_length}
+ charset_path: ${model.charset_path}
+ language:
+ checkpoint: checkpoints/abinet/pretrain-language-model/pretrain-language-model.pth
+ num_layers: 4
+ loss_weight: 1.
+ detach: True
+ use_self_attn: False
+ max_length: ${model.max_length}
+ charset_path: ${model.charset_path}
+ alignment:
+ loss_weight: 1.
+ max_length: ${model.max_length}
+ charset_path: ${model.charset_path}
+
diff --git a/src/model/TextGen/src/module/abinet/configs/train_abinet_sv.yaml b/src/model/TextGen/src/module/abinet/configs/train_abinet_sv.yaml
new file mode 100644
index 0000000..63209ea
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/configs/train_abinet_sv.yaml
@@ -0,0 +1,71 @@
+global:
+ name: train-abinet-sv
+ phase: train
+ stage: train-super
+ workdir: workdir
+ seed: ~
+
+dataset:
+ train: {
+ roots: ['data/training/MJ/MJ_train/',
+ 'data/training/MJ/MJ_test/',
+ 'data/training/MJ/MJ_valid/',
+ 'data/training/ST'],
+ batch_size: 384
+ }
+ test: {
+ roots: ['data/evaluation/IIIT5k_3000',
+ 'data/evaluation/SVT',
+ 'data/evaluation/SVTP',
+ 'data/evaluation/IC13_857',
+ 'data/evaluation/IC15_1811',
+ 'data/evaluation/CUTE80'],
+ batch_size: 384
+ }
+ data_aug: True
+ multiscales: False
+ num_workers: 14
+
+training:
+ epochs: 10
+ show_iters: 50
+ eval_iters: 3000
+ save_iters: 3000
+
+optimizer:
+ type: Adam
+ true_wd: False
+ wd: 0.0
+ bn_wd: False
+ clip_grad: 20
+ lr: 0.0001
+ args: {
+ betas: !!python/tuple [0.9, 0.999], # for default Adam
+ }
+ scheduler: {
+ periods: [6, 4],
+ gamma: 0.1,
+ }
+
+model:
+ name: 'abinet.modules.model_abinet_iter.ABINetIterModel'
+ iter_size: 3
+ ensemble: ''
+ use_vision: False
+ vision: {
+ checkpoint: workdir/pretrain-vision-model-sv/best-pretrain-vision-model-sv.pth,
+ loss_weight: 1.,
+ attention: 'attention',
+ backbone: 'transformer',
+ backbone_ln: 2,
+ }
+ language: {
+ checkpoint: workdir/pretrain-language-model/pretrain-language-model.pth,
+ num_layers: 4,
+ loss_weight: 1.,
+ detach: True,
+ use_self_attn: False
+ }
+ alignment: {
+ loss_weight: 1.,
+ }
diff --git a/src/model/TextGen/src/module/abinet/configs/train_abinet_wo_iter.yaml b/src/model/TextGen/src/module/abinet/configs/train_abinet_wo_iter.yaml
new file mode 100644
index 0000000..d3a5b81
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/configs/train_abinet_wo_iter.yaml
@@ -0,0 +1,68 @@
+global:
+ name: train-abinet-wo-iter
+ phase: train
+ stage: train-super
+ workdir: workdir
+ seed: ~
+
+dataset:
+ train: {
+ roots: ['data/training/MJ/MJ_train/',
+ 'data/training/MJ/MJ_test/',
+ 'data/training/MJ/MJ_valid/',
+ 'data/training/ST'],
+ batch_size: 384
+ }
+ test: {
+ roots: ['data/evaluation/IIIT5k_3000',
+ 'data/evaluation/SVT',
+ 'data/evaluation/SVTP',
+ 'data/evaluation/IC13_857',
+ 'data/evaluation/IC15_1811',
+ 'data/evaluation/CUTE80'],
+ batch_size: 384
+ }
+ data_aug: True
+ multiscales: False
+ num_workers: 14
+
+training:
+ epochs: 10
+ show_iters: 50
+ eval_iters: 3000
+ save_iters: 3000
+
+optimizer:
+ type: Adam
+ true_wd: False
+ wd: 0.0
+ bn_wd: False
+ clip_grad: 20
+ lr: 0.0001
+ args: {
+ betas: !!python/tuple [0.9, 0.999], # for default Adam
+ }
+ scheduler: {
+ periods: [6, 4],
+ gamma: 0.1,
+ }
+
+model:
+ name: 'abinet.modules.model_abinet.ABINetModel'
+ vision: {
+ checkpoint: workdir/pretrain-vision-model/best-pretrain-vision-model.pth,
+ loss_weight: 1.,
+ attention: 'position',
+ backbone: 'transformer',
+ backbone_ln: 3,
+ }
+ language: {
+ checkpoint: workdir/pretrain-language-model/pretrain-language-model.pth,
+ num_layers: 4,
+ loss_weight: 1.,
+ detach: True,
+ use_self_attn: False
+ }
+ alignment: {
+ loss_weight: 1.,
+ }
diff --git a/src/model/TextGen/src/module/abinet/configs/train_contrast_abinet.yaml b/src/model/TextGen/src/module/abinet/configs/train_contrast_abinet.yaml
new file mode 100644
index 0000000..e398c31
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/configs/train_contrast_abinet.yaml
@@ -0,0 +1,103 @@
+lightning:
+ logger:
+ callbacks: {}
+ modelcheckpoint:
+ monitor: "val/loss"
+ trainer:
+ benchmark: true
+
+trainer:
+ accelerator: gpu
+ devices: [6, ]
+ strategy: ddp
+ amp_backend: native
+ log_every_n_steps: 50 # this is global step
+ precision: 16
+ max_epochs: 10
+ check_val_every_n_epoch: 1
+ accumulate_grad_batches: 1
+
+
+model_eval: alignment
+model:
+ name: 'abinet.modules.model_abinet_iter.ContrastABINetIterModel'
+ iter_size: 3
+ ensemble: ''
+ use_vision: False
+ max_length: ${dataset.max_length}
+ charset_path: ${dataset.charset_path}
+
+ source: "raw"
+ base_learning_rate: 1e-3
+ precision: ${trainer.precision}
+ weight_decay: 0.0
+ adam_epsilon: 1.0e-8
+
+ vision:
+ checkpoint: checkpoints/abinet/pretrain-vision-model/best-pretrain-vision-model.pth
+ loss_weight: 1.
+ attention: 'position'
+ backbone: 'transformer'
+ backbone_ln: 3
+ d_model: 512
+ charset_path: ${model.charset_path}
+
+ class_num_heads: 8
+ contrast_hidden_dim: 512
+ max_length: ${model.max_length}
+
+ language:
+ checkpoint: checkpoints/abinet/pretrain-language-model/pretrain-language-model.pth
+ num_layers: 4
+ loss_weight: 1.
+ detach: True
+ use_self_attn: False
+ max_length: ${model.max_length}
+ charset_path: ${model.charset_path}
+
+ alignment:
+ loss_weight: 1.
+ max_length: ${model.max_length}
+ charset_path: ${model.charset_path}
+
+ char_tokenizer:
+ pretrained_path: "checkpoint/abinet/chartokenizer"
+ pad_token: "\u2591"
+ unk_token: "\u2591"
+ cls_token: "[bos]"
+
+ char_embedder:
+ vocab_size: 95 # by default
+ embedding_dim: 32
+ padding_idx: 0
+ attention_head_dim: 2
+ encoder:
+ contrast_hidden_dim: 512
+ num_heads: 8
+ num_encoder_layers: 2
+
+data:
+ batch_size: 128
+ base_dir: dataset/ocr-dataset/SynthText/data_dir/
+ train:
+ target: "train_abinet.ContrastOCRData"
+ params:
+ data_csv: ${data.base_dir}/expand_train.csv
+ img_dir: ${data.base_dir}
+ is_raw_synth: True
+ width: 128 # same as ABINet
+ height: 32
+ multiscale: True
+ training: False
+
+ validation:
+ target: "train_abinet.ContrastOCRData"
+ params:
+ data_csv: ${data.base_dir}/expand_val.csv
+ img_dir: ${data.base_dir}
+ is_raw_synth: True
+ width: 128
+ height: 32
+ multiscale: False
+ training: False
+
\ No newline at end of file
diff --git a/src/model/TextGen/src/module/abinet/modules/__init__.py b/src/model/TextGen/src/module/abinet/modules/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/model/TextGen/src/module/abinet/modules/attention.py b/src/model/TextGen/src/module/abinet/modules/attention.py
new file mode 100644
index 0000000..6b70138
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/modules/attention.py
@@ -0,0 +1,97 @@
+import torch
+import torch.nn as nn
+from .transformer import PositionalEncoding
+
+class Attention(nn.Module):
+ def __init__(self, in_channels=512, max_length=25, n_feature=256):
+ super().__init__()
+ self.max_length = max_length
+
+ self.f0_embedding = nn.Embedding(max_length, in_channels)
+ self.w0 = nn.Linear(max_length, n_feature)
+ self.wv = nn.Linear(in_channels, in_channels)
+ self.we = nn.Linear(in_channels, max_length)
+
+ self.active = nn.Tanh()
+ self.softmax = nn.Softmax(dim=2)
+
+ def forward(self, enc_output):
+ enc_output = enc_output.permute(0, 2, 3, 1).flatten(1, 2)
+ reading_order = torch.arange(self.max_length, dtype=torch.long, device=enc_output.device)
+ reading_order = reading_order.unsqueeze(0).expand(enc_output.size(0), -1) # (S,) -> (B, S)
+ reading_order_embed = self.f0_embedding(reading_order) # b,25,512
+
+ t = self.w0(reading_order_embed.permute(0, 2, 1)) # b,512,256
+ t = self.active(t.permute(0, 2, 1) + self.wv(enc_output)) # b,256,512
+
+ attn = self.we(t) # b,256,25
+ attn = self.softmax(attn.permute(0, 2, 1)) # b,25,256
+ g_output = torch.bmm(attn, enc_output) # b,25,512
+ return g_output, attn.view(*attn.shape[:2], 8, 32)
+
+
+def encoder_layer(in_c, out_c, k=3, s=2, p=1):
+ return nn.Sequential(nn.Conv2d(in_c, out_c, k, s, p),
+ nn.BatchNorm2d(out_c),
+ nn.ReLU(True))
+
+def decoder_layer(in_c, out_c, k=3, s=1, p=1, mode='nearest', scale_factor=None, size=None):
+ align_corners = None if mode=='nearest' else True
+ return nn.Sequential(nn.Upsample(size=size, scale_factor=scale_factor,
+ mode=mode, align_corners=align_corners),
+ nn.Conv2d(in_c, out_c, k, s, p),
+ nn.BatchNorm2d(out_c),
+ nn.ReLU(True))
+
+
+class PositionAttention(nn.Module):
+ def __init__(self, max_length, in_channels=512, num_channels=64,
+ h=8, w=32, mode='nearest', **kwargs):
+ super().__init__()
+ self.max_length = max_length
+ self.k_encoder = nn.Sequential(
+ encoder_layer(in_channels, num_channels, s=(1, 2)),
+ encoder_layer(num_channels, num_channels, s=(2, 2)),
+ encoder_layer(num_channels, num_channels, s=(2, 2)),
+ encoder_layer(num_channels, num_channels, s=(2, 2))
+ )
+ self.k_decoder = nn.Sequential(
+ decoder_layer(num_channels, num_channels, scale_factor=2, mode=mode),
+ decoder_layer(num_channels, num_channels, scale_factor=2, mode=mode),
+ decoder_layer(num_channels, num_channels, scale_factor=2, mode=mode),
+ decoder_layer(num_channels, in_channels, size=(h, w), mode=mode)
+ )
+
+ self.pos_encoder = PositionalEncoding(in_channels, dropout=0, max_len=max_length)
+ self.project = nn.Linear(in_channels, in_channels)
+
+ def forward(self, x):
+ N, E, H, W = x.size()
+ k, v = x, x # (N, E, H, W)
+
+ # calculate key vector
+ features = []
+ for i in range(0, len(self.k_encoder)):
+ k = self.k_encoder[i](k)
+ features.append(k)
+ for i in range(0, len(self.k_decoder) - 1):
+ k = self.k_decoder[i](k)
+ k = k + features[len(self.k_decoder) - 2 - i]
+ k = self.k_decoder[-1](k)
+
+ # calculate query vector
+ # TODO q=f(q,k)
+ zeros = x.new_zeros((self.max_length, N, E)) # (T, N, E)
+ q = self.pos_encoder(zeros) # (T, N, E)
+ q = q.permute(1, 0, 2) # (N, T, E)
+ q = self.project(q) # (N, T, E)
+
+ # calculate attention
+ attn_scores = torch.bmm(q, k.flatten(2, 3)) # (N, T, (H*W))
+ attn_scores = attn_scores / (E ** 0.5)
+ attn_scores = torch.softmax(attn_scores, dim=-1)
+
+ v = v.permute(0, 2, 3, 1).view(N, -1, E) # (N, (H*W), E)
+ attn_vecs = torch.bmm(attn_scores, v) # (N, T, E)
+
+ return attn_vecs, attn_scores.view(N, -1, H, W)
\ No newline at end of file
diff --git a/src/model/TextGen/src/module/abinet/modules/backbone.py b/src/model/TextGen/src/module/abinet/modules/backbone.py
new file mode 100644
index 0000000..b07d384
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/modules/backbone.py
@@ -0,0 +1,35 @@
+# from fastai.vision import *
+
+from .model import _default_tfmer_cfg
+from .resnet import resnet45
+from .transformer import (PositionalEncoding,
+ TransformerEncoder,
+ TransformerEncoderLayer)
+from .module_util import *
+
+class ResTranformer(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.resnet = resnet45()
+
+ self.d_model = config.get("d_model", _default_tfmer_cfg["d_model"])
+ # self.d_model = ifnone(config.model_vision_d_model, _default_tfmer_cfg['d_model'])
+ nhead = config.get("nhead", _default_tfmer_cfg["nhead"])
+ d_inner = config.get("d_inner", _default_tfmer_cfg["d_inner"])
+ dropout = config.get("dropout", _default_tfmer_cfg["dropout"])
+ activation = config.get("activation", _default_tfmer_cfg["activation"])
+ num_layers = config.get("backbone_ln", 2)
+
+ self.pos_encoder = PositionalEncoding(self.d_model, max_len=8*32)
+ encoder_layer = TransformerEncoderLayer(d_model=self.d_model, nhead=nhead,
+ dim_feedforward=d_inner, dropout=dropout, activation=activation)
+ self.transformer = TransformerEncoder(encoder_layer, num_layers)
+
+ def forward(self, images):
+ feature = self.resnet(images)
+ n, c, h, w = feature.shape
+ feature = feature.view(n, c, -1).permute(2, 0, 1)
+ feature = self.pos_encoder(feature)
+ feature = self.transformer(feature)
+ feature = feature.permute(1, 2, 0).view(n, c, h, w)
+ return feature
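+
+# Shape sketch: a (N, 3, 32, 128) crop gives a (N, 512, 8, 32) map from resnet45; it is
+# flattened to a length-256 sequence (hence max_len=8*32 above), encoded, and reshaped back.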
diff --git a/src/model/TextGen/src/module/abinet/modules/backbone_v2.py b/src/model/TextGen/src/module/abinet/modules/backbone_v2.py
new file mode 100644
index 0000000..a84f205
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/modules/backbone_v2.py
@@ -0,0 +1,4 @@
+""" Created by MrBBS """
+# 10/11/2022
+# -*-encoding:utf-8-*-
+
diff --git a/src/model/TextGen/src/module/abinet/modules/losses.py b/src/model/TextGen/src/module/abinet/modules/losses.py
new file mode 100644
index 0000000..8606f2a
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/modules/losses.py
@@ -0,0 +1,73 @@
+# from fastai.vision import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class MultiLosses(nn.Module):
+ def __init__(self, one_hot=True):
+ super().__init__()
+ self.ce = SoftCrossEntropyLoss() if one_hot else torch.nn.CrossEntropyLoss()
+ self.bce = torch.nn.BCELoss()
+
+ @property
+ def last_losses(self):
+ return self.losses
+
+ def _flatten(self, sources, lengths):
+ return torch.cat([t[:l] for t, l in zip(sources, lengths)])
+
+ def _merge_list(self, all_res):
+ if not isinstance(all_res, (list, tuple)):
+ return all_res
+ def merge(items):
+ if isinstance(items[0], torch.Tensor): return torch.cat(items, dim=0)
+ else: return items[0]
+ res = dict()
+ for key in all_res[0].keys():
+ items = [r[key] for r in all_res]
+ res[key] = merge(items)
+ return res
+
+ def _ce_loss(self, output, gt_labels, gt_lengths, idx=None, record=True):
+ loss_name = output.get('name')
+ pt_logits, weight = output['logits'], output['loss_weight']
+
+ assert pt_logits.shape[0] % gt_labels.shape[0] == 0
+ iter_size = pt_logits.shape[0] // gt_labels.shape[0]
+        if iter_size > 1:
+            # tile the ground truth to match the (iter_size * B) stacked logits
+            gt_labels = gt_labels.repeat(iter_size, 1, 1)
+            gt_lengths = gt_lengths.repeat(iter_size)
+ flat_gt_labels = self._flatten(gt_labels, gt_lengths)
+ flat_pt_logits = self._flatten(pt_logits, gt_lengths)
+
+ nll = output.get('nll')
+ if nll is not None:
+ loss = self.ce(flat_pt_logits, flat_gt_labels, softmax=False) * weight
+ else:
+ loss = self.ce(flat_pt_logits, flat_gt_labels) * weight
+ if record and loss_name is not None: self.losses[f'{loss_name}_loss'] = loss
+
+ return loss
+
+ def forward(self, outputs, *args):
+ self.losses = {}
+ if isinstance(outputs, (tuple, list)):
+ outputs = [self._merge_list(o) for o in outputs]
+ # return sum([self._ce_loss(o, *args) for o in outputs if o['loss_weight'] > 0.])
+ return torch.mean(torch.stack([self._ce_loss(o, *args) for o in outputs if o['loss_weight'] > 0.]))
+ else:
+ return self._ce_loss(outputs, *args, record=False)
+
+
+class SoftCrossEntropyLoss(nn.Module):
+ def __init__(self, reduction="mean"):
+ super().__init__()
+ self.reduction = reduction
+
+ def forward(self, input, target, softmax=True):
+ if softmax: log_prob = F.log_softmax(input, dim=-1)
+ else: log_prob = torch.log(input)
+ loss = -(target * log_prob).sum(dim=-1)
+ if self.reduction == "mean": return loss.mean()
+ elif self.reduction == "sum": return loss.sum()
+ else: return loss
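+
+# Sanity-check sketch: with one-hot targets this reduces to the standard cross entropy,
+#   logits = torch.randn(4, 10)
+#   target = F.one_hot(torch.arange(4), 10).float()
+#   SoftCrossEntropyLoss()(logits, target)  # == F.cross_entropy(logits, torch.arange(4))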
diff --git a/src/model/TextGen/src/module/abinet/modules/model.py b/src/model/TextGen/src/module/abinet/modules/model.py
new file mode 100644
index 0000000..b308c86
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/modules/model.py
@@ -0,0 +1,59 @@
+import torch
+import torch.nn as nn
+import numpy as np
+
+from ..utils import CharsetMapper
+
+_default_tfmer_cfg = dict(d_model=512, nhead=8, d_inner=2048, # 1024
+ dropout=0.1, activation='relu')
+
+
+class Model(nn.Module):
+
+ def __init__(self, config):
+ super().__init__()
+ self.max_length = config.max_length + 1
+ self.charset = CharsetMapper(config.charset_path,
+ max_length=self.max_length)
+
+ def load(self, source, device=None, strict=True):
+ state = torch.load(source, map_location=device)
+ self.load_state_dict(state['model'], strict=strict)
+
+ def _get_length(self, logit):
+        """ Greedy decoder to obtain length from logit"""
+ out = (logit.argmax(dim=-1) == self.charset.null_label)
+ out = self.first_nonzero(out.int()) + 1
+ return out
+
+ @staticmethod
+ def first_nonzero(x):
+ non_zero_mask = x != 0
+ mask_max_values, mask_max_indices = torch.max(
+ non_zero_mask.int(), dim=-1)
+ mask_max_indices[mask_max_values == 0] = -1
+ return mask_max_indices
+
+ @staticmethod
+ def _get_padding_mask(length, max_length):
+ length = length.unsqueeze(-1)
+ grid = torch.arange(0, max_length, device=length.device).unsqueeze(0)
+ return grid >= length
+
+ @staticmethod
+ def _get_square_subsequent_mask(sz, device, diagonal=0, fw=True):
+ r"""Generate a square mask for the sequence. The masked positions are filled with float('-inf').
+ Unmasked positions are filled with float(0.0).
+ """
+ mask = (torch.triu(torch.ones(sz, sz, device=device), diagonal=diagonal) == 1)
+ if fw:
+ mask = mask.transpose(0, 1)
+ mask = mask.float().masked_fill(mask == 0, float(
+ '-inf')).masked_fill(mask == 1, float(0.0))
+ return mask
+
+ @staticmethod
+ def _get_location_mask(sz, device=None):
+ mask = torch.eye(sz, device=device)
+ mask = mask.float().masked_fill(mask == 1, float('-inf'))
+ return mask
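+
+# e.g. _get_location_mask(3) puts -inf on the diagonal and 0 elsewhere, so position i may
+# attend to every position except itself (the cloze-style masking used by the language model).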
diff --git a/src/model/TextGen/src/module/abinet/modules/model_abinet.py b/src/model/TextGen/src/module/abinet/modules/model_abinet.py
new file mode 100644
index 0000000..2c9bfb2
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/modules/model_abinet.py
@@ -0,0 +1,32 @@
+# from fastai.vision import *
+
+import torch
+import torch.nn as nn
+
+from .model_alignment import BaseAlignment
+from .model_language import BCNLanguage
+from .model_vision import BaseVision, ContrastVision
+
+from .module_util import *
+
+class ABINetModel(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.use_alignment = ifnone(config.model_use_alignment, True)
+ self.max_length = config.dataset_max_length + 1 # additional stop token
+ self.vision = BaseVision(config)
+ self.language = BCNLanguage(config)
+ if self.use_alignment: self.alignment = BaseAlignment(config)
+
+ def forward(self, images, *args):
+ v_res = self.vision(images)
+ v_tokens = torch.softmax(v_res['logits'], dim=-1)
+        v_lengths = v_res['pt_lengths'].clamp_(2, self.max_length)  # TODO: move to language model
+
+ l_res = self.language(v_tokens, v_lengths)
+ if not self.use_alignment:
+ return l_res, v_res
+ l_feature, v_feature = l_res['feature'], v_res['feature']
+
+ a_res = self.alignment(l_feature, v_feature)
+ return a_res, l_res, v_res
diff --git a/src/model/TextGen/src/module/abinet/modules/model_abinet_iter.py b/src/model/TextGen/src/module/abinet/modules/model_abinet_iter.py
new file mode 100644
index 0000000..69af4ab
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/modules/model_abinet_iter.py
@@ -0,0 +1,91 @@
+import torch
+import torch.nn.functional as F
+import torchvision.transforms.functional as vision_F
+
+from .model_vision import BaseVision, ContrastVision
+from .model_language import BCNLanguage
+from .model_alignment import BaseAlignment
+from .module_util import *
+from .losses import MultiLosses
+from typing import Tuple
+
+
+class ABINetIterModel(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.iter_size = config.iter_size
+ self.max_length = config.max_length + 1 # additional stop token
+ self.vision = BaseVision(config.vision)
+ self.language = BCNLanguage(config.language)
+ self.alignment = BaseAlignment(config.alignment)
+ self.export = config.get("export", False)
+
+ def forward(self, images, mode="train", *args):
+ v_res = self.vision(images)
+ a_res = v_res
+ all_l_res, all_a_res = [], []
+ for _ in range(self.iter_size):
+ tokens = torch.softmax(a_res["logits"], dim=-1)
+ lengths = a_res["pt_lengths"]
+            lengths.clamp_(2, self.max_length)  # TODO: move to language model
+ l_res = self.language(tokens, lengths)
+ all_l_res.append(l_res)
+ a_res = self.alignment(l_res["feature"], v_res["feature"])
+ all_a_res.append(a_res)
+ if self.export:
+ return F.softmax(a_res["logits"], dim=2), a_res["pt_lengths"]
+ if mode == "train":
+ return all_a_res, all_l_res, v_res
+ elif mode == "validation":
+ return all_a_res, all_l_res, v_res, (a_res, all_l_res[-1], v_res)
+ else:
+ return a_res, all_l_res[-1], v_res
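+
+    # Refinement sketch: each of the iter_size passes feeds the previous alignment
+    # logits back through the language model, so with iter_size=3 the
+    # vision -> language -> alignment correction is applied three times.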
+
+
+class ABINetIterModelWrapper(nn.Module):
+    # wrapper around ABINetIterModel that computes the recognition loss internally
+ def __init__(self, config, width, height) -> None:
+ super().__init__()
+ # TODO: accomodate ContrastABINetIterModel
+ self.abinet = ABINetIterModel(config)
+ self.width = width
+ self.height = height
+ self.loss_fn = MultiLosses(True)
+
+ def preprocess_char(self, char_tokenizer, labels, device):
+ # convert label strings to char_input_ids, one_hot_label for loss computation
+ inputs = char_tokenizer(
+ labels,
+ return_tensors="pt",
+ padding="max_length",
+ truncation=True,
+ add_special_tokens=True,
+ )
+ char_input_ids = inputs["input_ids"]
+ abi_num_classes = len(char_tokenizer) - 1
+ abi_labels = char_input_ids[:, 1:]
+ gt_labels = F.one_hot(abi_labels, abi_num_classes)
+ gt_lengths = torch.sum(inputs.attention_mask, dim=1)
+ return char_input_ids.to(device), gt_labels.to(device), gt_lengths.to(device)
+
+ def preprocess(self, image):
+        # image: (C, H, W); resized to (self.height, self.width)
+ return vision_F.resize(image, (self.height, self.width))
+
+ def forward(self, images, char_inputs: Tuple, mode="train", *args):
+ char_input_ids, gt_labels, gt_lengths = char_inputs
+ assert images.device == char_input_ids.device == gt_labels.device
+ outputs = self.abinet(images, char_input_ids, mode)
+ celoss_inputs = outputs[:3]
+ # TODO: add contrast loss later
+ celoss = self.loss_fn(celoss_inputs, gt_labels, gt_lengths)
+ if mode == "train":
+ return celoss
+        elif mode == "test" or mode == "validation":
+            #! TODO: decoding the predictions via postprocess() also needs a charset and
+            # model_eval, which this wrapper does not hold; return the raw outputs instead.
+            return celoss, outputs[-1]
diff --git a/src/model/TextGen/src/module/abinet/modules/model_alignment.py b/src/model/TextGen/src/module/abinet/modules/model_alignment.py
new file mode 100644
index 0000000..b2693f5
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/modules/model_alignment.py
@@ -0,0 +1,33 @@
+import torch
+import torch.nn as nn
+
+from .model import Model, _default_tfmer_cfg
+from .module_util import *
+
+
+class BaseAlignment(Model):
+ def __init__(self, config):
+ super().__init__(config)
+ d_model = config.get("d_model", _default_tfmer_cfg["d_model"])
+
+ self.loss_weight = config.get("loss_weight", 1.0)
+ self.w_att = nn.Linear(2 * d_model, d_model)
+ self.cls = nn.Linear(d_model, self.charset.num_classes)
+
+ def forward(self, l_feature, v_feature):
+ """
+ Args:
+ l_feature: (N, T, E) where T is length, N is batch size and d is dim of model
+ v_feature: (N, T, E) shape the same as l_feature
+ l_lengths: (N,)
+ v_lengths: (N,)
+ """
+ f = torch.cat((l_feature, v_feature), dim=2)
+ f_att = torch.sigmoid(self.w_att(f))
+ output = f_att * v_feature + (1 - f_att) * l_feature
+
+ logits = self.cls(output) # (N, T, C)
+ pt_lengths = self._get_length(logits)
+
+ return {'logits': logits, 'pt_lengths': pt_lengths, 'loss_weight':self.loss_weight,
+ 'name': 'alignment'}
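+
+# Gating sketch: with g = sigmoid(w_att([l; v])), output = g * v + (1 - g) * l, i.e. a
+# learned per-dimension interpolation between the language and vision features.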
diff --git a/src/model/TextGen/src/module/abinet/modules/model_language.py b/src/model/TextGen/src/module/abinet/modules/model_language.py
new file mode 100644
index 0000000..a1f9ab5
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/modules/model_language.py
@@ -0,0 +1,67 @@
+import logging
+import torch.nn as nn
+
+from .model import _default_tfmer_cfg
+from .model import Model
+from .transformer import (PositionalEncoding,
+ TransformerDecoder,
+ TransformerDecoderLayer)
+
+from .module_util import *
+
+class BCNLanguage(Model):
+ def __init__(self, config):
+ super().__init__(config)
+ d_model = config.get("d_model", _default_tfmer_cfg['d_model'])
+ nhead = config.get("nhead", _default_tfmer_cfg['nhead'])
+ d_inner = config.get("d_inner", _default_tfmer_cfg['d_inner'])
+ dropout = config.get("dropout", _default_tfmer_cfg['dropout'])
+ activation = config.get("activation", _default_tfmer_cfg['activation'])
+ num_layers = config.get("num_layers", 4)
+
+ self.d_model = d_model
+ self.detach = config.get("detach", True)
+ self.use_self_attn = config.get("use_self_attn", False)
+ self.loss_weight = config.get("loss_weight", 1.0)
+
+ self.proj = nn.Linear(self.charset.num_classes, d_model, False)
+ self.token_encoder = PositionalEncoding(d_model, max_len=self.max_length)
+ self.pos_encoder = PositionalEncoding(d_model, dropout=0, max_len=self.max_length)
+ decoder_layer = TransformerDecoderLayer(d_model, nhead, d_inner, dropout,
+ activation, self_attn=self.use_self_attn)
+ self.model = TransformerDecoder(decoder_layer, num_layers)
+
+ self.cls = nn.Linear(d_model, self.charset.num_classes)
+
+ if config.checkpoint is not None:
+ logging.info(f'Read language model from {config.checkpoint}.')
+ self.load(config.checkpoint, device="cpu")
+
+ def forward(self, tokens, lengths):
+ """
+ Args:
+ tokens: (N, T, C) where T is length, N is batch size and C is classes number
+ lengths: (N,)
+ """
+ if self.detach: tokens = tokens.detach()
+ embed = self.proj(tokens) # (N, T, E)
+ embed = embed.permute(1, 0, 2) # (T, N, E)
+ embed = self.token_encoder(embed) # (T, N, E)
+ padding_mask = self._get_padding_mask(lengths, self.max_length)
+
+ zeros = embed.new_zeros(*embed.shape)
+        query = self.pos_encoder(zeros)
+        location_mask = self._get_location_mask(self.max_length, tokens.device)
+        output = self.model(query, embed,
+                            tgt_key_padding_mask=padding_mask,
+                            memory_mask=location_mask,
+                            memory_key_padding_mask=padding_mask)  # (T, N, E)
+ output = output.permute(1, 0, 2) # (N, T, E)
+
+ logits = self.cls(output) # (N, T, C)
+ pt_lengths = self._get_length(logits)
+
+ res = {'feature': output, 'logits': logits, 'pt_lengths': pt_lengths,
+ 'loss_weight':self.loss_weight, 'name': 'language'}
+ return res
diff --git a/src/model/TextGen/src/module/abinet/modules/model_vision.py b/src/model/TextGen/src/module/abinet/modules/model_vision.py
new file mode 100644
index 0000000..a415816
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/modules/model_vision.py
@@ -0,0 +1,88 @@
+import logging
+import torch.nn as nn
+
+from .attention import *
+from .backbone import ResTranformer
+from .model import Model
+from .resnet import resnet45
+
+from .module_util import *
+
+class BaseVision(Model):
+ def __init__(self, config):
+ super().__init__(config)
+ self.loss_weight = config.get("loss_weight", 1.0)
+ self.out_channels = config.get("d_model", 512)
+
+ if config.backbone == 'transformer':
+ self.backbone = ResTranformer(config)
+ else:
+ self.backbone = resnet45()
+
+ if config.attention == 'position':
+ mode = config.get("attention_mode", 'nearest')
+ self.attention = PositionAttention(
+ max_length=config.max_length + 1, # additional stop token
+ mode=mode,
+ )
+ elif config.attention == 'attention':
+ self.attention = Attention(
+ max_length=config.max_length + 1, # additional stop token
+ n_feature=8*32,
+ )
+ else:
+ raise Exception(f'{config.attention} is not valid.')
+ self.cls = nn.Linear(self.out_channels, self.charset.num_classes)
+
+ if config.checkpoint is not None:
+ logging.info(f'Read vision model from {config.checkpoint}.')
+ self.load(config.checkpoint, device="cpu") # always cpu first and then convert
+
+ def forward(self, images, *args):
+ features = self.backbone(images) # (N, E, H, W)
+ attn_vecs, attn_scores = self.attention(features) # (N, T, E), (N, T, H, W)
+ logits = self.cls(attn_vecs) # (N, T, C)
+ pt_lengths = self._get_length(logits)
+
+ return {'feature': attn_vecs, 'logits': logits, 'pt_lengths': pt_lengths,
+ 'attn_scores': attn_scores, 'loss_weight':self.loss_weight, 'name': 'vision'}
+
+class ContrastVision(BaseVision):
+ def __init__(self, config):
+ assert config.attention == 'position', "Contrastive learning only supports position attention in this model."
+ super().__init__(config) # backbone is not changed
+
+ # gather the information from attn_vecs provided by features
+ self.class_embedding_q = nn.Parameter(torch.randn(self.out_channels))
+ self.class_encoder = nn.MultiheadAttention(
+ embed_dim=self.out_channels, num_heads=config.class_num_heads, batch_first=True,
+ )
+
+ def forward(self, images, *args):
+ # 1. Extract features
+ features = self.backbone(images) # (N, E, H, W)
+ attn_vecs, attn_scores = self.attention(features) # (N, T, E), (N, T, H, W)
+
+ # 2. logits as before
+ logits = self.cls(attn_vecs)
+ pt_lengths = self._get_length(logits)
+
+        # 3. Compute the class embedding for contrastive learning.
+        # attn_vecs already carries position information (positional embedding),
+        # so we use an attention mechanism to take a weighted sum over it.
+
+        # pt_lengths holds the length of each sequence; positions beyond it are padding
+        valid = torch.arange(attn_vecs.shape[1], device=logits.device)[None, :] < pt_lengths[:, None]
+        class_embedding_q = self.class_embedding_q.expand(attn_vecs.shape[0], 1, -1)  # expand to (N, 1, E)
+        # attention-weighted sum of attn_vecs, (N, 1, E);
+        # nn.MultiheadAttention's key_padding_mask marks positions to IGNORE, hence the negation
+        class_feature, _ = self.class_encoder(
+            query=class_embedding_q,
+            key=attn_vecs,
+            value=attn_vecs,
+            key_padding_mask=~valid,
+        )  # we only want the weighted value
+ class_feature = class_feature[:, 0, :] # (N, E)
+
+ return {'feature': attn_vecs, 'logits': logits,
+ 'pt_lengths': pt_lengths, 'class_feature' : class_feature,
+ 'attn_scores': attn_scores, 'loss_weight':self.loss_weight, 'name': 'vision',}
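+
+# Note: class_feature is a single pooled (N, E) vector per image, presumably serving as
+# the anchor embedding for a contrastive objective defined outside this module.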
diff --git a/src/model/TextGen/src/module/abinet/modules/module_util.py b/src/model/TextGen/src/module/abinet/modules/module_util.py
new file mode 100644
index 0000000..16e48e1
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/modules/module_util.py
@@ -0,0 +1,33 @@
+import torch.nn.functional as F
+import torch.nn as nn
+from typing import Any
+
+
+def ifnone(a: Any, b: Any) -> Any:
+    """`a` if `a` is not None, otherwise `b`."""
+    return b if a is None else a
+
+def postprocess(output, charset, model_eval):
+ def _get_output(last_output, model_eval):
+ if isinstance(last_output, (tuple, list)):
+ for res in last_output:
+ if res['name'] == model_eval:
+ output = res
+ else:
+ output = last_output
+ return output
+
+ def _decode(logit):
+        """ Greedy decode """
+ out = F.softmax(logit, dim=2)
+ pt_text, pt_scores, pt_lengths = [], [], []
+ for o in out:
+ text = charset.get_text(o.argmax(dim=1), padding=False, trim=False)
+ text = text.split(charset.null_char)[0] # end at end-token
+ pt_text.append(text)
+ pt_scores.append(o.max(dim=1)[0])
+ pt_lengths.append(min(len(text) + 1, charset.max_length)) # one for end-token
+ return pt_text, pt_scores, pt_lengths
+
+ output = _get_output(output, model_eval)
+ logits, pt_lengths = output['logits'], output['pt_lengths']
+ pt_text, pt_scores, pt_lengths_ = _decode(logits)
+ return pt_text, pt_scores, pt_lengths_
diff --git a/src/model/TextGen/src/module/abinet/modules/resnet.py b/src/model/TextGen/src/module/abinet/modules/resnet.py
new file mode 100644
index 0000000..5ffb908
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/modules/resnet.py
@@ -0,0 +1,104 @@
+import math
+
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.model_zoo as model_zoo
+
+
+def conv1x1(in_planes, out_planes, stride=1):
+ return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+ "3x3 convolution with padding"
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+ padding=1, bias=False)
+
+
+class BasicBlock(nn.Module):
+ expansion = 1
+
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
+ super(BasicBlock, self).__init__()
+ self.conv1 = conv1x1(inplanes, planes)
+ self.bn1 = nn.BatchNorm2d(planes)
+ self.relu = nn.ReLU(inplace=True)
+ self.conv2 = conv3x3(planes, planes, stride)
+ self.bn2 = nn.BatchNorm2d(planes)
+ self.downsample = downsample
+ self.stride = stride
+
+ def forward(self, x):
+ residual = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out += residual
+ out = self.relu(out)
+
+ return out
+
+
+class ResNet(nn.Module):
+
+ def __init__(self, block, layers):
+ self.inplanes = 32
+ super(ResNet, self).__init__()
+ self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1,
+ bias=False)
+ self.bn1 = nn.BatchNorm2d(32)
+ self.relu = nn.ReLU(inplace=True)
+
+ self.layer1 = self._make_layer(block, 32, layers[0], stride=2)
+ self.layer2 = self._make_layer(block, 64, layers[1], stride=1)
+ self.layer3 = self._make_layer(block, 128, layers[2], stride=2)
+ self.layer4 = self._make_layer(block, 256, layers[3], stride=1)
+ self.layer5 = self._make_layer(block, 512, layers[4], stride=1)
+
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+ m.weight.data.normal_(0, math.sqrt(2. / n))
+ elif isinstance(m, nn.BatchNorm2d):
+ m.weight.data.fill_(1)
+ m.bias.data.zero_()
+
+ def _make_layer(self, block, planes, blocks, stride=1):
+ downsample = None
+ if stride != 1 or self.inplanes != planes * block.expansion:
+ downsample = nn.Sequential(
+ nn.Conv2d(self.inplanes, planes * block.expansion,
+ kernel_size=1, stride=stride, bias=False),
+ nn.BatchNorm2d(planes * block.expansion),
+ )
+
+ layers = []
+ layers.append(block(self.inplanes, planes, stride, downsample))
+ self.inplanes = planes * block.expansion
+ for i in range(1, blocks):
+ layers.append(block(self.inplanes, planes))
+
+ return nn.Sequential(*layers)
+
+ def forward(self, x):
+ x = self.conv1(x)
+ x = self.bn1(x)
+ x = self.relu(x)
+ x = self.layer1(x)
+ x = self.layer2(x)
+ x = self.layer3(x)
+ x = self.layer4(x)
+ x = self.layer5(x)
+ return x
+
+
+def resnet45():
+ return ResNet(BasicBlock, [3, 4, 6, 6, 3])
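+
+# With layer strides (2, 1, 2, 1, 1) the network downsamples by 4 in each spatial dim,
+# so a (N, 3, 32, 128) text crop maps to a (N, 512, 8, 32) feature tensor.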
diff --git a/src/model/TextGen/src/module/abinet/modules/transformer.py b/src/model/TextGen/src/module/abinet/modules/transformer.py
new file mode 100644
index 0000000..6dde312
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/modules/transformer.py
@@ -0,0 +1,901 @@
+# pytorch 1.5.0
+import copy
+import math
+import warnings
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+from torch.nn import Dropout, LayerNorm, Linear, Module, ModuleList, Parameter
+from torch.nn import functional as F
+from torch.nn.init import constant_, xavier_uniform_
+
+
+def multi_head_attention_forward(query, # type: Tensor
+ key, # type: Tensor
+ value, # type: Tensor
+ embed_dim_to_check, # type: int
+ num_heads, # type: int
+ in_proj_weight, # type: Tensor
+ in_proj_bias, # type: Tensor
+ bias_k, # type: Optional[Tensor]
+ bias_v, # type: Optional[Tensor]
+ add_zero_attn, # type: bool
+ dropout_p, # type: float
+ out_proj_weight, # type: Tensor
+ out_proj_bias, # type: Tensor
+ training=True, # type: bool
+ key_padding_mask=None, # type: Optional[Tensor]
+ need_weights=True, # type: bool
+ attn_mask=None, # type: Optional[Tensor]
+ use_separate_proj_weight=False, # type: bool
+ q_proj_weight=None, # type: Optional[Tensor]
+ k_proj_weight=None, # type: Optional[Tensor]
+ v_proj_weight=None, # type: Optional[Tensor]
+ static_k=None, # type: Optional[Tensor]
+ static_v=None # type: Optional[Tensor]
+ ):
+ # type: (...) -> Tuple[Tensor, Optional[Tensor]]
+ r"""
+ Args:
+ query, key, value: map a query and a set of key-value pairs to an output.
+ See "Attention Is All You Need" for more details.
+ embed_dim_to_check: total dimension of the model.
+ num_heads: parallel attention heads.
+ in_proj_weight, in_proj_bias: input projection weight and bias.
+ bias_k, bias_v: bias of the key and value sequences to be added at dim=0.
+ add_zero_attn: add a new batch of zeros to the key and
+ value sequences at dim=1.
+ dropout_p: probability of an element to be zeroed.
+ out_proj_weight, out_proj_bias: the output projection weight and bias.
+ training: apply dropout if is ``True``.
+ key_padding_mask: if provided, specified padding elements in the key will
+ be ignored by the attention. This is an binary mask. When the value is True,
+ the corresponding value on the attention layer will be filled with -inf.
+ need_weights: output attn_output_weights.
+ attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
+ the batches while a 3D mask allows to specify a different mask for the entries of each batch.
+ use_separate_proj_weight: the function accept the proj. weights for query, key,
+ and value in different forms. If false, in_proj_weight will be used, which is
+ a combination of q_proj_weight, k_proj_weight, v_proj_weight.
+ q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias.
+ static_k, static_v: static key and value used for attention operators.
+ Shape:
+ Inputs:
+ - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
+ the embedding dimension.
+ - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
+ the embedding dimension.
+ - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
+ the embedding dimension.
+ - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
+ If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions
+ will be unchanged. If a BoolTensor is provided, the positions with the
+ value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
+ - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
+ 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
+ S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
+ positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
+ while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
+ are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
+ is provided, it will be added to the attention weight.
+ - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
+ N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
+ - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
+ N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
+ Outputs:
+ - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
+ E is the embedding dimension.
+ - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
+ L is the target sequence length, S is the source sequence length.
+ """
+ # if not torch.jit.is_scripting():
+ # tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v,
+ # out_proj_weight, out_proj_bias)
+ # if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops):
+ # return handle_torch_function(
+ # multi_head_attention_forward, tens_ops, query, key, value,
+ # embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias,
+ # bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight,
+ # out_proj_bias, training=training, key_padding_mask=key_padding_mask,
+ # need_weights=need_weights, attn_mask=attn_mask,
+ # use_separate_proj_weight=use_separate_proj_weight,
+ # q_proj_weight=q_proj_weight, k_proj_weight=k_proj_weight,
+ # v_proj_weight=v_proj_weight, static_k=static_k, static_v=static_v)
+ tgt_len, bsz, embed_dim = query.size()
+ assert embed_dim == embed_dim_to_check
+ assert key.size() == value.size()
+
+ head_dim = embed_dim // num_heads
+ assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
+ scaling = float(head_dim) ** -0.5
+
+ if not use_separate_proj_weight:
+ if torch.equal(query, key) and torch.equal(key, value):
+ # self-attention
+ q, k, v = F.linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1)
+
+ elif torch.equal(key, value):
+ # encoder-decoder attention
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
+ _b = in_proj_bias
+ _start = 0
+ _end = embed_dim
+ _w = in_proj_weight[_start:_end, :]
+ if _b is not None:
+ _b = _b[_start:_end]
+ q = F.linear(query, _w, _b)
+
+ if key is None:
+ assert value is None
+ k = None
+ v = None
+ else:
+
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
+ _b = in_proj_bias
+ _start = embed_dim
+ _end = None
+ _w = in_proj_weight[_start:, :]
+ if _b is not None:
+ _b = _b[_start:]
+ k, v = F.linear(key, _w, _b).chunk(2, dim=-1)
+
+ else:
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
+ _b = in_proj_bias
+ _start = 0
+ _end = embed_dim
+ _w = in_proj_weight[_start:_end, :]
+ if _b is not None:
+ _b = _b[_start:_end]
+ q = F.linear(query, _w, _b)
+
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
+ _b = in_proj_bias
+ _start = embed_dim
+ _end = embed_dim * 2
+ _w = in_proj_weight[_start:_end, :]
+ if _b is not None:
+ _b = _b[_start:_end]
+ k = F.linear(key, _w, _b)
+
+ # This is inline in_proj function with in_proj_weight and in_proj_bias
+ _b = in_proj_bias
+ _start = embed_dim * 2
+ _end = None
+ _w = in_proj_weight[_start:, :]
+ if _b is not None:
+ _b = _b[_start:]
+ v = F.linear(value, _w, _b)
+ else:
+ q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight)
+ len1, len2 = q_proj_weight_non_opt.size()
+ assert len1 == embed_dim and len2 == query.size(-1)
+
+ k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight)
+ len1, len2 = k_proj_weight_non_opt.size()
+ assert len1 == embed_dim and len2 == key.size(-1)
+
+ v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight)
+ len1, len2 = v_proj_weight_non_opt.size()
+ assert len1 == embed_dim and len2 == value.size(-1)
+
+ if in_proj_bias is not None:
+ q = F.linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim])
+ k = F.linear(key, k_proj_weight_non_opt, in_proj_bias[embed_dim:(embed_dim * 2)])
+ v = F.linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2):])
+ else:
+ q = F.linear(query, q_proj_weight_non_opt, in_proj_bias)
+ k = F.linear(key, k_proj_weight_non_opt, in_proj_bias)
+ v = F.linear(value, v_proj_weight_non_opt, in_proj_bias)
+ q = q * scaling
+
+ if attn_mask is not None:
+ assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \
+ attn_mask.dtype == torch.float16 or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool, \
+ 'Only float, byte, and bool types are supported for attn_mask, not {}'.format(attn_mask.dtype)
+ if attn_mask.dtype == torch.uint8:
+ warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
+ attn_mask = attn_mask.to(torch.bool)
+
+ if attn_mask.dim() == 2:
+ attn_mask = attn_mask.unsqueeze(0)
+ if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
+ raise RuntimeError('The size of the 2D attn_mask is not correct.')
+ elif attn_mask.dim() == 3:
+ if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]:
+ raise RuntimeError('The size of the 3D attn_mask is not correct.')
+ else:
+ raise RuntimeError("attn_mask's dimension {} is not supported".format(attn_mask.dim()))
+ # attn_mask's dim is 3 now.
+
+ # # convert ByteTensor key_padding_mask to bool
+ # if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
+ # warnings.warn("Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
+ # key_padding_mask = key_padding_mask.to(torch.bool)
+
+ if bias_k is not None and bias_v is not None:
+ if static_k is None and static_v is None:
+ k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
+ v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
+ if attn_mask is not None:
+ attn_mask = pad(attn_mask, (0, 1))
+ if key_padding_mask is not None:
+ key_padding_mask = pad(key_padding_mask, (0, 1))
+ else:
+ assert static_k is None, "bias cannot be added to static key."
+ assert static_v is None, "bias cannot be added to static value."
+ else:
+ assert bias_k is None
+ assert bias_v is None
+
+ q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
+ if k is not None:
+ k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
+ if v is not None:
+ v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
+
+ if static_k is not None:
+ assert static_k.size(0) == bsz * num_heads
+ assert static_k.size(2) == head_dim
+ k = static_k
+
+ if static_v is not None:
+ assert static_v.size(0) == bsz * num_heads
+ assert static_v.size(2) == head_dim
+ v = static_v
+
+ src_len = k.size(1)
+
+ if key_padding_mask is not None:
+ assert key_padding_mask.size(0) == bsz
+ assert key_padding_mask.size(1) == src_len
+
+ if add_zero_attn:
+ src_len += 1
+ k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)
+ v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)
+ if attn_mask is not None:
+ attn_mask = pad(attn_mask, (0, 1))
+ if key_padding_mask is not None:
+ key_padding_mask = pad(key_padding_mask, (0, 1))
+
+ attn_output_weights = torch.bmm(q, k.transpose(1, 2))
+ assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]
+
+ if attn_mask is not None:
+ if attn_mask.dtype == torch.bool:
+ attn_output_weights.masked_fill_(attn_mask, float('-inf'))
+ else:
+ attn_output_weights += attn_mask
+
+
+ if key_padding_mask is not None:
+ attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
+ attn_output_weights = attn_output_weights.masked_fill(
+ key_padding_mask.unsqueeze(1).unsqueeze(2),
+ float('-inf'),
+ )
+ attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)
+
+ attn_output_weights = F.softmax(
+ attn_output_weights, dim=-1)
+ attn_output_weights = F.dropout(attn_output_weights, p=dropout_p, training=training)
+
+ attn_output = torch.bmm(attn_output_weights, v)
+ assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
+ attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
+ attn_output = F.linear(attn_output, out_proj_weight, out_proj_bias)
+
+ if need_weights:
+ # average attention weights over heads
+ attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
+ return attn_output, attn_output_weights.sum(dim=1) / num_heads
+ else:
+ return attn_output, None
+
+class MultiheadAttention(Module):
+ r"""Allows the model to jointly attend to information
+ from different representation subspaces.
+ See reference: Attention Is All You Need
+ .. math::
+ \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
+        \text{where}\ head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
+ Args:
+ embed_dim: total dimension of the model.
+ num_heads: parallel attention heads.
+ dropout: a Dropout layer on attn_output_weights. Default: 0.0.
+ bias: add bias as module parameter. Default: True.
+ add_bias_kv: add bias to the key and value sequences at dim=0.
+ add_zero_attn: add a new batch of zeros to the key and
+ value sequences at dim=1.
+ kdim: total number of features in key. Default: None.
+ vdim: total number of features in value. Default: None.
+ Note: if kdim and vdim are None, they will be set to embed_dim such that
+ query, key, and value have the same number of features.
+ Examples::
+ >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
+ >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
+ """
+ # __annotations__ = {
+ # 'bias_k': torch._jit_internal.Optional[torch.Tensor],
+ # 'bias_v': torch._jit_internal.Optional[torch.Tensor],
+ # }
+ __constants__ = ['q_proj_weight', 'k_proj_weight', 'v_proj_weight', 'in_proj_weight']
+
+ def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None):
+ super(MultiheadAttention, self).__init__()
+ self.embed_dim = embed_dim
+ self.kdim = kdim if kdim is not None else embed_dim
+ self.vdim = vdim if vdim is not None else embed_dim
+ self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
+
+ self.num_heads = num_heads
+ self.dropout = dropout
+ self.head_dim = embed_dim // num_heads
+ assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+
+ if self._qkv_same_embed_dim is False:
+ self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
+ self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
+ self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
+ self.register_parameter('in_proj_weight', None)
+ else:
+ self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
+ self.register_parameter('q_proj_weight', None)
+ self.register_parameter('k_proj_weight', None)
+ self.register_parameter('v_proj_weight', None)
+
+ if bias:
+ self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
+ else:
+ self.register_parameter('in_proj_bias', None)
+ self.out_proj = Linear(embed_dim, embed_dim, bias=bias)
+
+ if add_bias_kv:
+ self.bias_k = Parameter(torch.empty(1, 1, embed_dim))
+ self.bias_v = Parameter(torch.empty(1, 1, embed_dim))
+ else:
+ self.bias_k = self.bias_v = None
+
+ self.add_zero_attn = add_zero_attn
+
+ self._reset_parameters()
+
+ def _reset_parameters(self):
+ if self._qkv_same_embed_dim:
+ xavier_uniform_(self.in_proj_weight)
+ else:
+ xavier_uniform_(self.q_proj_weight)
+ xavier_uniform_(self.k_proj_weight)
+ xavier_uniform_(self.v_proj_weight)
+
+ if self.in_proj_bias is not None:
+ constant_(self.in_proj_bias, 0.)
+ constant_(self.out_proj.bias, 0.)
+ if self.bias_k is not None:
+ xavier_normal_(self.bias_k)
+ if self.bias_v is not None:
+ xavier_normal_(self.bias_v)
+
+ def __setstate__(self, state):
+ # Support loading old MultiheadAttention checkpoints generated by v1.1.0
+ if '_qkv_same_embed_dim' not in state:
+ state['_qkv_same_embed_dim'] = True
+
+ super(MultiheadAttention, self).__setstate__(state)
+
+ def forward(self, query, key, value, key_padding_mask=None,
+ need_weights=True, attn_mask=None):
+ # type: (Tensor, Tensor, Tensor, Optional[Tensor], bool, Optional[Tensor]) -> Tuple[Tensor, Optional[Tensor]]
+ r"""
+ Args:
+ query, key, value: map a query and a set of key-value pairs to an output.
+ See "Attention Is All You Need" for more details.
+ key_padding_mask: if provided, specified padding elements in the key will
+            be ignored by the attention. This is a binary mask. When the value is True,
+ the corresponding value on the attention layer will be filled with -inf.
+ need_weights: output attn_output_weights.
+ attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
+ the batches while a 3D mask allows to specify a different mask for the entries of each batch.
+ Shape:
+ - Inputs:
+ - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
+ the embedding dimension.
+ - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
+ the embedding dimension.
+ - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
+ the embedding dimension.
+ - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
+ If a ByteTensor is provided, the non-zero positions will be ignored while the position
+ with the zero positions will be unchanged. If a BoolTensor is provided, the positions with the
+ value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
+ - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
+ 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
+          S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
+          positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
+          while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
+          are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
+ is provided, it will be added to the attention weight.
+ - Outputs:
+ - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
+ E is the embedding dimension.
+ - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
+ L is the target sequence length, S is the source sequence length.
+ """
+ if not self._qkv_same_embed_dim:
+ return multi_head_attention_forward(
+ query, key, value, self.embed_dim, self.num_heads,
+ self.in_proj_weight, self.in_proj_bias,
+ self.bias_k, self.bias_v, self.add_zero_attn,
+ self.dropout, self.out_proj.weight, self.out_proj.bias,
+ training=self.training,
+ key_padding_mask=key_padding_mask, need_weights=need_weights,
+ attn_mask=attn_mask, use_separate_proj_weight=True,
+ q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
+ v_proj_weight=self.v_proj_weight)
+ else:
+ return multi_head_attention_forward(
+ query, key, value, self.embed_dim, self.num_heads,
+ self.in_proj_weight, self.in_proj_bias,
+ self.bias_k, self.bias_v, self.add_zero_attn,
+ self.dropout, self.out_proj.weight, self.out_proj.bias,
+ training=self.training,
+ key_padding_mask=key_padding_mask, need_weights=need_weights,
+ attn_mask=attn_mask)
+
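Editor's note: a minimal usage sketch of the re-implemented module above, not part of the patch; all sizes are illustrative, and the boolean mask follows the semantics spelled out in the docstring (``True`` = may not be attended).

```python
# Minimal sketch: driving this MultiheadAttention like nn.MultiheadAttention.
import torch

embed_dim, num_heads, L, S, N = 512, 8, 4, 6, 2
mha = MultiheadAttention(embed_dim, num_heads)

query = torch.rand(L, N, embed_dim)   # (L, N, E)
key   = torch.rand(S, N, embed_dim)   # (S, N, E)
value = torch.rand(S, N, embed_dim)   # (S, N, E)

# BoolTensor semantics from the docstring: True = position may NOT be attended.
attn_mask = torch.zeros(L, S, dtype=torch.bool)
attn_mask[:, -1] = True               # hide the last source position

out, weights = mha(query, key, value, attn_mask=attn_mask)
print(out.shape)      # torch.Size([4, 2, 512])
print(weights.shape)  # torch.Size([2, 4, 6]); masked column is zero after softmax
```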
+
+class Transformer(Module):
+ r"""A transformer model. User is able to modify the attributes as needed. The architecture
+ is based on the paper "Attention Is All You Need". Ashish Vaswani, Noam Shazeer,
+ Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and
+ Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information
+    Processing Systems, pages 6000-6010. Users can build the BERT (https://arxiv.org/abs/1810.04805)
+ model with corresponding parameters.
+
+ Args:
+ d_model: the number of expected features in the encoder/decoder inputs (default=512).
+ nhead: the number of heads in the multiheadattention models (default=8).
+ num_encoder_layers: the number of sub-encoder-layers in the encoder (default=6).
+ num_decoder_layers: the number of sub-decoder-layers in the decoder (default=6).
+ dim_feedforward: the dimension of the feedforward network model (default=2048).
+ dropout: the dropout value (default=0.1).
+ activation: the activation function of encoder/decoder intermediate layer, relu or gelu (default=relu).
+ custom_encoder: custom encoder (default=None).
+ custom_decoder: custom decoder (default=None).
+
+ Examples::
+ >>> transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12)
+ >>> src = torch.rand((10, 32, 512))
+ >>> tgt = torch.rand((20, 32, 512))
+ >>> out = transformer_model(src, tgt)
+
+ Note: A full example to apply nn.Transformer module for the word language model is available in
+ https://github.com/pytorch/examples/tree/master/word_language_model
+ """
+
+ def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
+ num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
+ activation="relu", custom_encoder=None, custom_decoder=None):
+ super(Transformer, self).__init__()
+
+ if custom_encoder is not None:
+ self.encoder = custom_encoder
+ else:
+ encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
+ encoder_norm = LayerNorm(d_model)
+ self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
+
+ if custom_decoder is not None:
+ self.decoder = custom_decoder
+ else:
+ decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
+ decoder_norm = LayerNorm(d_model)
+ self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)
+
+ self._reset_parameters()
+
+ self.d_model = d_model
+ self.nhead = nhead
+
+ def forward(self, src, tgt, src_mask=None, tgt_mask=None,
+ memory_mask=None, src_key_padding_mask=None,
+ tgt_key_padding_mask=None, memory_key_padding_mask=None):
+ # type: (Tensor, Tensor, Optional[Tensor], Optional[Tensor], Optional[Tensor], Optional[Tensor], Optional[Tensor], Optional[Tensor]) -> Tensor # noqa
+ r"""Take in and process masked source/target sequences.
+
+ Args:
+ src: the sequence to the encoder (required).
+ tgt: the sequence to the decoder (required).
+ src_mask: the additive mask for the src sequence (optional).
+ tgt_mask: the additive mask for the tgt sequence (optional).
+ memory_mask: the additive mask for the encoder output (optional).
+ src_key_padding_mask: the ByteTensor mask for src keys per batch (optional).
+ tgt_key_padding_mask: the ByteTensor mask for tgt keys per batch (optional).
+ memory_key_padding_mask: the ByteTensor mask for memory keys per batch (optional).
+
+ Shape:
+ - src: :math:`(S, N, E)`.
+ - tgt: :math:`(T, N, E)`.
+ - src_mask: :math:`(S, S)`.
+ - tgt_mask: :math:`(T, T)`.
+ - memory_mask: :math:`(T, S)`.
+ - src_key_padding_mask: :math:`(N, S)`.
+ - tgt_key_padding_mask: :math:`(N, T)`.
+ - memory_key_padding_mask: :math:`(N, S)`.
+
+ Note: [src/tgt/memory]_mask ensures that position i is allowed to attend the unmasked
+ positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
+ while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
+ are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
+ is provided, it will be added to the attention weight.
+ [src/tgt/memory]_key_padding_mask provides specified elements in the key to be ignored by
+ the attention. If a ByteTensor is provided, the non-zero positions will be ignored while the zero
+ positions will be unchanged. If a BoolTensor is provided, the positions with the
+ value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
+
+ - output: :math:`(T, N, E)`.
+
+        Note: Due to the multi-head attention architecture in the transformer model,
+        the output sequence length of a transformer is the same as the input sequence
+        (i.e. target) length of the decoder.
+
+ where S is the source sequence length, T is the target sequence length, N is the
+ batch size, E is the feature number
+
+ Examples:
+ >>> output = transformer_model(src, tgt, src_mask=src_mask, tgt_mask=tgt_mask)
+ """
+
+ if src.size(1) != tgt.size(1):
+ raise RuntimeError("the batch number of src and tgt must be equal")
+
+ if src.size(2) != self.d_model or tgt.size(2) != self.d_model:
+ raise RuntimeError("the feature number of src and tgt must be equal to d_model")
+
+ memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask)
+ output = self.decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask,
+ tgt_key_padding_mask=tgt_key_padding_mask,
+ memory_key_padding_mask=memory_key_padding_mask)
+ return output
+
+ def generate_square_subsequent_mask(self, sz):
+ r"""Generate a square mask for the sequence. The masked positions are filled with float('-inf').
+ Unmasked positions are filled with float(0.0).
+ """
+ mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
+ mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
+ return mask
+
+ def _reset_parameters(self):
+ r"""Initiate parameters in the transformer model."""
+
+ for p in self.parameters():
+ if p.dim() > 1:
+ xavier_uniform_(p)
+
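Editor's note: a small demo of what `generate_square_subsequent_mask` produces; this is an editorial sketch with illustrative sizes, and the additive float mask matches the FloatTensor semantics described in the docstrings above.

```python
# Editorial sketch: the causal mask for a length-4 target sequence.
import torch

model = Transformer(d_model=32, nhead=2, num_encoder_layers=1,
                    num_decoder_layers=1, dim_feedforward=64)
mask = model.generate_square_subsequent_mask(4)
print(mask)
# tensor([[0., -inf, -inf, -inf],
#         [0.,   0., -inf, -inf],
#         [0.,   0.,   0., -inf],
#         [0.,   0.,   0.,   0.]])
# Added to the attention logits, the -inf entries zero out attention to
# future positions after the softmax, enforcing autoregressive decoding.
```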
+
+class TransformerEncoder(Module):
+ r"""TransformerEncoder is a stack of N encoder layers
+
+ Args:
+ encoder_layer: an instance of the TransformerEncoderLayer() class (required).
+ num_layers: the number of sub-encoder-layers in the encoder (required).
+ norm: the layer normalization component (optional).
+
+ Examples::
+ >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
+ >>> transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
+ >>> src = torch.rand(10, 32, 512)
+ >>> out = transformer_encoder(src)
+ """
+ __constants__ = ['norm']
+
+ def __init__(self, encoder_layer, num_layers, norm=None):
+ super(TransformerEncoder, self).__init__()
+ self.layers = _get_clones(encoder_layer, num_layers)
+ self.num_layers = num_layers
+ self.norm = norm
+
+ def forward(self, src, mask=None, src_key_padding_mask=None):
+ # type: (Tensor, Optional[Tensor], Optional[Tensor]) -> Tensor
+ r"""Pass the input through the encoder layers in turn.
+
+ Args:
+ src: the sequence to the encoder (required).
+ mask: the mask for the src sequence (optional).
+ src_key_padding_mask: the mask for the src keys per batch (optional).
+
+ Shape:
+ see the docs in Transformer class.
+ """
+ output = src
+
+ for i, mod in enumerate(self.layers):
+ output = mod(output, src_mask=mask, src_key_padding_mask=src_key_padding_mask)
+
+ if self.norm is not None:
+ output = self.norm(output)
+
+ return output
+
+
+class TransformerDecoder(Module):
+ r"""TransformerDecoder is a stack of N decoder layers
+
+ Args:
+ decoder_layer: an instance of the TransformerDecoderLayer() class (required).
+ num_layers: the number of sub-decoder-layers in the decoder (required).
+ norm: the layer normalization component (optional).
+
+ Examples::
+ >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8)
+ >>> transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
+ >>> memory = torch.rand(10, 32, 512)
+ >>> tgt = torch.rand(20, 32, 512)
+ >>> out = transformer_decoder(tgt, memory)
+ """
+ __constants__ = ['norm']
+
+ def __init__(self, decoder_layer, num_layers, norm=None):
+ super(TransformerDecoder, self).__init__()
+ self.layers = _get_clones(decoder_layer, num_layers)
+ self.num_layers = num_layers
+ self.norm = norm
+
+ def forward(self, tgt, memory, memory2=None, tgt_mask=None,
+ memory_mask=None, memory_mask2=None, tgt_key_padding_mask=None,
+ memory_key_padding_mask=None, memory_key_padding_mask2=None):
+        # type: (Tensor, Tensor, Optional[Tensor], Optional[Tensor], Optional[Tensor], Optional[Tensor], Optional[Tensor], Optional[Tensor], Optional[Tensor]) -> Tensor
+ r"""Pass the inputs (and mask) through the decoder layer in turn.
+
+ Args:
+ tgt: the sequence to the decoder (required).
+ memory: the sequence from the last layer of the encoder (required).
+ tgt_mask: the mask for the tgt sequence (optional).
+ memory_mask: the mask for the memory sequence (optional).
+ tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
+ memory_key_padding_mask: the mask for the memory keys per batch (optional).
+
+ Shape:
+ see the docs in Transformer class.
+ """
+ output = tgt
+
+ for mod in self.layers:
+ output = mod(output, memory, memory2=memory2, tgt_mask=tgt_mask,
+ memory_mask=memory_mask, memory_mask2=memory_mask2,
+ tgt_key_padding_mask=tgt_key_padding_mask,
+ memory_key_padding_mask=memory_key_padding_mask,
+ memory_key_padding_mask2=memory_key_padding_mask2)
+
+ if self.norm is not None:
+ output = self.norm(output)
+
+ return output
+
+class TransformerEncoderLayer(Module):
+ r"""TransformerEncoderLayer is made up of self-attn and feedforward network.
+ This standard encoder layer is based on the paper "Attention Is All You Need".
+ Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
+ Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
+ Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
+ in a different way during application.
+
+ Args:
+ d_model: the number of expected features in the input (required).
+ nhead: the number of heads in the multiheadattention models (required).
+ dim_feedforward: the dimension of the feedforward network model (default=2048).
+ dropout: the dropout value (default=0.1).
+ activation: the activation function of intermediate layer, relu or gelu (default=relu).
+
+ Examples::
+ >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
+ >>> src = torch.rand(10, 32, 512)
+ >>> out = encoder_layer(src)
+ """
+
+ def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
+ activation="relu", debug=False):
+ super(TransformerEncoderLayer, self).__init__()
+ self.debug = debug
+ self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
+ # Implementation of Feedforward model
+ self.linear1 = Linear(d_model, dim_feedforward)
+ self.dropout = Dropout(dropout)
+ self.linear2 = Linear(dim_feedforward, d_model)
+
+ self.norm1 = LayerNorm(d_model)
+ self.norm2 = LayerNorm(d_model)
+ self.dropout1 = Dropout(dropout)
+ self.dropout2 = Dropout(dropout)
+
+ self.activation = _get_activation_fn(activation)
+
+ def __setstate__(self, state):
+ if 'activation' not in state:
+ state['activation'] = F.relu
+ super(TransformerEncoderLayer, self).__setstate__(state)
+
+ def forward(self, src, src_mask=None, src_key_padding_mask=None):
+ # type: (Tensor, Optional[Tensor], Optional[Tensor]) -> Tensor
+ r"""Pass the input through the encoder layer.
+
+ Args:
+ src: the sequence to the encoder layer (required).
+ src_mask: the mask for the src sequence (optional).
+ src_key_padding_mask: the mask for the src keys per batch (optional).
+
+ Shape:
+ see the docs in Transformer class.
+ """
+ src2, attn = self.self_attn(src, src, src, attn_mask=src_mask,
+ key_padding_mask=src_key_padding_mask)
+ if self.debug: self.attn = attn
+ src = src + self.dropout1(src2)
+ src = self.norm1(src)
+ src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
+ src = src + self.dropout2(src2)
+ src = self.norm2(src)
+
+ return src
+
+
+class TransformerDecoderLayer(Module):
+ r"""TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network.
+ This standard decoder layer is based on the paper "Attention Is All You Need".
+ Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
+ Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
+ Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
+ in a different way during application.
+
+ Args:
+ d_model: the number of expected features in the input (required).
+ nhead: the number of heads in the multiheadattention models (required).
+ dim_feedforward: the dimension of the feedforward network model (default=2048).
+ dropout: the dropout value (default=0.1).
+ activation: the activation function of intermediate layer, relu or gelu (default=relu).
+
+ Examples::
+ >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8)
+ >>> memory = torch.rand(10, 32, 512)
+ >>> tgt = torch.rand(20, 32, 512)
+ >>> out = decoder_layer(tgt, memory)
+ """
+
+ def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
+ activation="relu", self_attn=True, siamese=False, debug=False):
+ super(TransformerDecoderLayer, self).__init__()
+ self.has_self_attn, self.siamese = self_attn, siamese
+ self.debug = debug
+ if self.has_self_attn:
+ self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
+ self.norm1 = LayerNorm(d_model)
+ self.dropout1 = Dropout(dropout)
+ self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
+ # Implementation of Feedforward model
+ self.linear1 = Linear(d_model, dim_feedforward)
+ self.dropout = Dropout(dropout)
+ self.linear2 = Linear(dim_feedforward, d_model)
+
+ self.norm2 = LayerNorm(d_model)
+ self.norm3 = LayerNorm(d_model)
+ self.dropout2 = Dropout(dropout)
+ self.dropout3 = Dropout(dropout)
+ if self.siamese:
+ self.multihead_attn2 = MultiheadAttention(d_model, nhead, dropout=dropout)
+
+ self.activation = _get_activation_fn(activation)
+
+ def __setstate__(self, state):
+ if 'activation' not in state:
+ state['activation'] = F.relu
+ super(TransformerDecoderLayer, self).__setstate__(state)
+
+ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None,
+ tgt_key_padding_mask=None, memory_key_padding_mask=None,
+ memory2=None, memory_mask2=None, memory_key_padding_mask2=None):
+        # type: (Tensor, Tensor, Optional[Tensor], Optional[Tensor], Optional[Tensor], Optional[Tensor], Optional[Tensor], Optional[Tensor], Optional[Tensor]) -> Tensor
+ r"""Pass the inputs (and mask) through the decoder layer.
+
+ Args:
+ tgt: the sequence to the decoder layer (required).
+ memory: the sequence from the last layer of the encoder (required).
+ tgt_mask: the mask for the tgt sequence (optional).
+ memory_mask: the mask for the memory sequence (optional).
+ tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
+ memory_key_padding_mask: the mask for the memory keys per batch (optional).
+
+ Shape:
+ see the docs in Transformer class.
+ """
+ if self.has_self_attn:
+ tgt2, attn = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask,
+ key_padding_mask=tgt_key_padding_mask)
+ tgt = tgt + self.dropout1(tgt2)
+ tgt = self.norm1(tgt)
+ if self.debug: self.attn = attn
+ tgt2, attn2 = self.multihead_attn(tgt, memory, memory, attn_mask=memory_mask,
+ key_padding_mask=memory_key_padding_mask)
+ if self.debug: self.attn2 = attn2
+
+ if self.siamese:
+ tgt3, attn3 = self.multihead_attn2(tgt, memory2, memory2, attn_mask=memory_mask2,
+ key_padding_mask=memory_key_padding_mask2)
+ tgt = tgt + self.dropout2(tgt3)
+ if self.debug: self.attn3 = attn3
+
+ tgt = tgt + self.dropout2(tgt2)
+ tgt = self.norm2(tgt)
+ tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
+ tgt = tgt + self.dropout3(tgt2)
+ tgt = self.norm3(tgt)
+
+ return tgt
+
+
+def _get_clones(module, N):
+ return ModuleList([copy.deepcopy(module) for i in range(N)])
+
+
+def _get_activation_fn(activation):
+ if activation == "relu":
+ return F.relu
+ elif activation == "gelu":
+ return F.gelu
+
+ raise RuntimeError("activation should be relu/gelu, not {}".format(activation))
+
+
+class PositionalEncoding(nn.Module):
+ r"""Inject some information about the relative or absolute position of the tokens
+ in the sequence. The positional encodings have the same dimension as
+ the embeddings, so that the two can be summed. Here, we use sine and cosine
+ functions of different frequencies.
+ .. math::
+        \text{PosEncoder}(pos, 2i) = \sin(pos/10000^{2i/d_{model}})
+        \text{PosEncoder}(pos, 2i+1) = \cos(pos/10000^{2i/d_{model}})
+        \text{where pos is the word position and i is the embed idx}
+ Args:
+ d_model: the embed dim (required).
+ dropout: the dropout value (default=0.1).
+ max_len: the max. length of the incoming sequence (default=5000).
+ Examples:
+ >>> pos_encoder = PositionalEncoding(d_model)
+ """
+
+ def __init__(self, d_model, dropout=0.1, max_len=5000):
+ super(PositionalEncoding, self).__init__()
+ self.dropout = nn.Dropout(p=dropout)
+
+ pe = torch.zeros(max_len, d_model)
+ position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+ div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+ pe[:, 0::2] = torch.sin(position * div_term)
+ pe[:, 1::2] = torch.cos(position * div_term)
+ pe = pe.unsqueeze(0).transpose(0, 1)
+ self.register_buffer('pe', pe)
+
+ def forward(self, x):
+ r"""Inputs of forward function
+ Args:
+ x: the sequence fed to the positional encoder model (required).
+ Shape:
+ x: [sequence length, batch size, embed dim]
+ output: [sequence length, batch size, embed dim]
+ Examples:
+ >>> output = pos_encoder(x)
+ """
+
+ x = x + self.pe[:x.size(0), :]
+ return self.dropout(x)
+
+
+if __name__ == '__main__':
+ transformer_model = Transformer(nhead=16, num_encoder_layers=12)
+ src = torch.rand((10, 32, 512))
+ tgt = torch.rand((20, 32, 512))
+ out = transformer_model(src, tgt)
+ print(out)
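Editor's note: a short sanity check of the `PositionalEncoding` module above (an editorial sketch; dropout is disabled so the additive encoding is exactly recoverable).

```python
# Editorial sketch: with zero input and dropout=0, the output equals the
# precomputed sinusoidal table itself.
import torch

pos_encoder = PositionalEncoding(d_model=512, dropout=0.0)
x = torch.zeros(10, 32, 512)              # (seq_len, batch, embed_dim)
out = pos_encoder(x)
print(out.shape)                          # torch.Size([10, 32, 512])
assert torch.allclose(out[:, 0, :], pos_encoder.pe[:10, 0, :])
```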
diff --git a/src/model/TextGen/src/module/abinet/transforms.py b/src/model/TextGen/src/module/abinet/transforms.py
new file mode 100644
index 0000000..5a7042f
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/transforms.py
@@ -0,0 +1,329 @@
+import math
+import numbers
+import random
+
+import cv2
+import numpy as np
+from PIL import Image
+from torchvision import transforms
+from torchvision.transforms import Compose
+
+
+def sample_asym(magnitude, size=None):
+ return np.random.beta(1, 4, size) * magnitude
+
+def sample_sym(magnitude, size=None):
+ return (np.random.beta(4, 4, size=size) - 0.5) * 2 * magnitude
+
+def sample_uniform(low, high, size=None):
+ return np.random.uniform(low, high, size=size)
+
+def get_interpolation(type='random'):
+ if type == 'random':
+ choice = [cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA]
+ interpolation = choice[random.randint(0, len(choice)-1)]
+ elif type == 'nearest': interpolation = cv2.INTER_NEAREST
+ elif type == 'linear': interpolation = cv2.INTER_LINEAR
+ elif type == 'cubic': interpolation = cv2.INTER_CUBIC
+ elif type == 'area': interpolation = cv2.INTER_AREA
+    else: raise TypeError('Interpolation type must be one of: nearest, linear, cubic, area!')
+ return interpolation
+
+class CVRandomRotation(object):
+ def __init__(self, degrees=15):
+ assert isinstance(degrees, numbers.Number), "degree should be a single number."
+ assert degrees >= 0, "degree must be positive."
+ self.degrees = degrees
+
+ @staticmethod
+ def get_params(degrees):
+ return sample_sym(degrees)
+
+ def __call__(self, img):
+ angle = self.get_params(self.degrees)
+ src_h, src_w = img.shape[:2]
+ M = cv2.getRotationMatrix2D(center=(src_w/2, src_h/2), angle=angle, scale=1.0)
+ abs_cos, abs_sin = abs(M[0,0]), abs(M[0,1])
+ dst_w = int(src_h * abs_sin + src_w * abs_cos)
+ dst_h = int(src_h * abs_cos + src_w * abs_sin)
+ M[0, 2] += (dst_w - src_w)/2
+ M[1, 2] += (dst_h - src_h)/2
+
+ flags = get_interpolation()
+ return cv2.warpAffine(img, M, (dst_w, dst_h), flags=flags, borderMode=cv2.BORDER_REPLICATE)
+
+class CVRandomAffine(object):
+ def __init__(self, degrees, translate=None, scale=None, shear=None):
+ assert isinstance(degrees, numbers.Number), "degree should be a single number."
+ assert degrees >= 0, "degree must be positive."
+ self.degrees = degrees
+
+ if translate is not None:
+ assert isinstance(translate, (tuple, list)) and len(translate) == 2, \
+ "translate should be a list or tuple and it must be of length 2."
+ for t in translate:
+ if not (0.0 <= t <= 1.0):
+ raise ValueError("translation values should be between 0 and 1")
+ self.translate = translate
+
+ if scale is not None:
+ assert isinstance(scale, (tuple, list)) and len(scale) == 2, \
+ "scale should be a list or tuple and it must be of length 2."
+ for s in scale:
+ if s <= 0:
+ raise ValueError("scale values should be positive")
+ self.scale = scale
+
+ if shear is not None:
+ if isinstance(shear, numbers.Number):
+ if shear < 0:
+ raise ValueError("If shear is a single number, it must be positive.")
+ self.shear = [shear]
+ else:
+ assert isinstance(shear, (tuple, list)) and (len(shear) == 2), \
+ "shear should be a list or tuple and it must be of length 2."
+ self.shear = shear
+ else:
+ self.shear = shear
+
+ def _get_inverse_affine_matrix(self, center, angle, translate, scale, shear):
+ # https://github.com/pytorch/vision/blob/v0.4.0/torchvision/transforms/functional.py#L717
+ from numpy import sin, cos, tan
+
+ if isinstance(shear, numbers.Number):
+ shear = [shear, 0]
+
+ if not isinstance(shear, (tuple, list)) and len(shear) == 2:
+ raise ValueError(
+ "Shear should be a single value or a tuple/list containing " +
+ "two values. Got {}".format(shear))
+
+ rot = math.radians(angle)
+ sx, sy = [math.radians(s) for s in shear]
+
+ cx, cy = center
+ tx, ty = translate
+
+ # RSS without scaling
+ a = cos(rot - sy) / cos(sy)
+ b = -cos(rot - sy) * tan(sx) / cos(sy) - sin(rot)
+ c = sin(rot - sy) / cos(sy)
+ d = -sin(rot - sy) * tan(sx) / cos(sy) + cos(rot)
+
+ # Inverted rotation matrix with scale and shear
+ # det([[a, b], [c, d]]) == 1, since det(rotation) = 1 and det(shear) = 1
+ M = [d, -b, 0,
+ -c, a, 0]
+ M = [x / scale for x in M]
+
+ # Apply inverse of translation and of center translation: RSS^-1 * C^-1 * T^-1
+ M[2] += M[0] * (-cx - tx) + M[1] * (-cy - ty)
+ M[5] += M[3] * (-cx - tx) + M[4] * (-cy - ty)
+
+ # Apply center translation: C * RSS^-1 * C^-1 * T^-1
+ M[2] += cx
+ M[5] += cy
+ return M
+
+ @staticmethod
+ def get_params(degrees, translate, scale_ranges, shears, height):
+ angle = sample_sym(degrees)
+ if translate is not None:
+ max_dx = translate[0] * height
+ max_dy = translate[1] * height
+ translations = (np.round(sample_sym(max_dx)), np.round(sample_sym(max_dy)))
+ else:
+ translations = (0, 0)
+
+ if scale_ranges is not None:
+ scale = sample_uniform(scale_ranges[0], scale_ranges[1])
+ else:
+ scale = 1.0
+
+ if shears is not None:
+ if len(shears) == 1:
+ shear = [sample_sym(shears[0]), 0.]
+ elif len(shears) == 2:
+ shear = [sample_sym(shears[0]), sample_sym(shears[1])]
+ else:
+ shear = 0.0
+
+ return angle, translations, scale, shear
+
+
+ def __call__(self, img):
+ src_h, src_w = img.shape[:2]
+ angle, translate, scale, shear = self.get_params(
+ self.degrees, self.translate, self.scale, self.shear, src_h)
+
+ M = self._get_inverse_affine_matrix((src_w/2, src_h/2), angle, (0, 0), scale, shear)
+ M = np.array(M).reshape(2,3)
+
+ startpoints = [(0, 0), (src_w - 1, 0), (src_w - 1, src_h - 1), (0, src_h - 1)]
+ project = lambda x, y, a, b, c: int(a*x + b*y + c)
+ endpoints = [(project(x, y, *M[0]), project(x, y, *M[1])) for x, y in startpoints]
+
+ rect = cv2.minAreaRect(np.array(endpoints))
+        bbox = cv2.boxPoints(rect).astype(dtype=np.int32)  # np.int was removed in NumPy 1.24
+ max_x, max_y = bbox[:, 0].max(), bbox[:, 1].max()
+ min_x, min_y = bbox[:, 0].min(), bbox[:, 1].min()
+
+ dst_w = int(max_x - min_x)
+ dst_h = int(max_y - min_y)
+ M[0, 2] += (dst_w - src_w) / 2
+ M[1, 2] += (dst_h - src_h) / 2
+
+ # add translate
+ dst_w += int(abs(translate[0]))
+ dst_h += int(abs(translate[1]))
+ if translate[0] < 0: M[0, 2] += abs(translate[0])
+ if translate[1] < 0: M[1, 2] += abs(translate[1])
+
+ flags = get_interpolation()
+ return cv2.warpAffine(img, M, (dst_w , dst_h), flags=flags, borderMode=cv2.BORDER_REPLICATE)
+
+class CVRandomPerspective(object):
+ def __init__(self, distortion=0.5):
+ self.distortion = distortion
+
+ def get_params(self, width, height, distortion):
+        offset_h = sample_asym(distortion * height / 2, size=4).astype(dtype=np.int32)
+        offset_w = sample_asym(distortion * width / 2, size=4).astype(dtype=np.int32)
+ topleft = ( offset_w[0], offset_h[0])
+ topright = (width - 1 - offset_w[1], offset_h[1])
+ botright = (width - 1 - offset_w[2], height - 1 - offset_h[2])
+ botleft = ( offset_w[3], height - 1 - offset_h[3])
+
+ startpoints = [(0, 0), (width - 1, 0), (width - 1, height - 1), (0, height - 1)]
+ endpoints = [topleft, topright, botright, botleft]
+ return np.array(startpoints, dtype=np.float32), np.array(endpoints, dtype=np.float32)
+
+ def __call__(self, img):
+ height, width = img.shape[:2]
+ startpoints, endpoints = self.get_params(width, height, self.distortion)
+ M = cv2.getPerspectiveTransform(startpoints, endpoints)
+
+ # TODO: more robust way to crop image
+ rect = cv2.minAreaRect(endpoints)
+        bbox = cv2.boxPoints(rect).astype(dtype=np.int32)
+ max_x, max_y = bbox[:, 0].max(), bbox[:, 1].max()
+ min_x, min_y = bbox[:, 0].min(), bbox[:, 1].min()
+ min_x, min_y = max(min_x, 0), max(min_y, 0)
+
+ flags = get_interpolation()
+ img = cv2.warpPerspective(img, M, (max_x, max_y), flags=flags, borderMode=cv2.BORDER_REPLICATE)
+ img = img[min_y:, min_x:]
+ return img
+
+class CVRescale(object):
+
+ def __init__(self, factor=4, base_size=(128, 512)):
+        """ Define image scales using a Gaussian pyramid and rescale the image to the target scale.
+
+        Args:
+            factor: upper bound on the number of pyramid downscaling steps sampled per image (default 4).
+            base_size: the base size used to build the bottom layer of the pyramid.
+ """
+ if isinstance(factor, numbers.Number):
+ self.factor = round(sample_uniform(0, factor))
+ elif isinstance(factor, (tuple, list)) and len(factor) == 2:
+ self.factor = round(sample_uniform(factor[0], factor[1]))
+ else:
+            raise Exception('factor must be a number or a list of length 2')
+ # assert factor is valid
+ self.base_h, self.base_w = base_size[:2]
+
+ def __call__(self, img):
+ if self.factor == 0: return img
+ src_h, src_w = img.shape[:2]
+ cur_w, cur_h = self.base_w, self.base_h
+ scale_img = cv2.resize(img, (cur_w, cur_h), interpolation=get_interpolation())
+ for _ in range(self.factor):
+ scale_img = cv2.pyrDown(scale_img)
+ scale_img = cv2.resize(scale_img, (src_w, src_h), interpolation=get_interpolation())
+ return scale_img
+
+class CVGaussianNoise(object):
+ def __init__(self, mean=0, var=20):
+ self.mean = mean
+ if isinstance(var, numbers.Number):
+ self.var = max(int(sample_asym(var)), 1)
+ elif isinstance(var, (tuple, list)) and len(var) == 2:
+ self.var = int(sample_uniform(var[0], var[1]))
+ else:
+            raise Exception('var must be a number or a list of length 2')
+
+ def __call__(self, img):
+ noise = np.random.normal(self.mean, self.var**0.5, img.shape)
+ img = np.clip(img + noise, 0, 255).astype(np.uint8)
+ return img
+
+class CVMotionBlur(object):
+ def __init__(self, degrees=12, angle=90):
+ if isinstance(degrees, numbers.Number):
+ self.degree = max(int(sample_asym(degrees)), 1)
+ elif isinstance(degrees, (tuple, list)) and len(degrees) == 2:
+ self.degree = int(sample_uniform(degrees[0], degrees[1]))
+ else:
+            raise Exception('degrees must be a number or a list of length 2')
+ self.angle = sample_uniform(-angle, angle)
+
+ def __call__(self, img):
+ M = cv2.getRotationMatrix2D((self.degree // 2, self.degree // 2), self.angle, 1)
+ motion_blur_kernel = np.zeros((self.degree, self.degree))
+ motion_blur_kernel[self.degree // 2, :] = 1
+ motion_blur_kernel = cv2.warpAffine(motion_blur_kernel, M, (self.degree, self.degree))
+ motion_blur_kernel = motion_blur_kernel / self.degree
+ img = cv2.filter2D(img, -1, motion_blur_kernel)
+ img = np.clip(img, 0, 255).astype(np.uint8)
+ return img
+
+class CVGeometry(object):
+ def __init__(self, degrees=15, translate=(0.3, 0.3), scale=(0.5, 2.),
+ shear=(45, 15), distortion=0.5, p=0.5):
+ self.p = p
+ type_p = random.random()
+ if type_p < 0.33:
+ self.transforms = CVRandomRotation(degrees=degrees)
+ elif type_p < 0.66:
+ self.transforms = CVRandomAffine(degrees=degrees, translate=translate, scale=scale, shear=shear)
+ else:
+ self.transforms = CVRandomPerspective(distortion=distortion)
+
+ def __call__(self, img):
+ if random.random() < self.p:
+ img = np.array(img)
+ return Image.fromarray(self.transforms(img))
+ else: return img
+
+class CVDeterioration(object):
+ def __init__(self, var, degrees, factor, p=0.5):
+ self.p = p
+ transforms = []
+ if var is not None:
+ transforms.append(CVGaussianNoise(var=var))
+ if degrees is not None:
+ transforms.append(CVMotionBlur(degrees=degrees))
+ if factor is not None:
+ transforms.append(CVRescale(factor=factor))
+
+ random.shuffle(transforms)
+ transforms = Compose(transforms)
+ self.transforms = transforms
+
+ def __call__(self, img):
+ if random.random() < self.p:
+ img = np.array(img)
+ return Image.fromarray(self.transforms(img))
+ else: return img
+
+
+class CVColorJitter(object):
+ def __init__(self, brightness=0.5, contrast=0.5, saturation=0.5, hue=0.1, p=0.5):
+ self.p = p
+ self.transforms = transforms.ColorJitter(brightness=brightness, contrast=contrast,
+ saturation=saturation, hue=hue)
+
+ def __call__(self, img):
+ if random.random() < self.p: return self.transforms(img)
+ else: return img
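Editor's note: these transforms are designed to be composed into a stochastic augmentation pipeline. The sketch below is editorial; the parameter values mirror common ABINet-style settings and are illustrative, not prescribed by this patch.

```python
# Editorial sketch: chaining the geometry / deterioration / color transforms.
from PIL import Image
from torchvision.transforms import Compose

augment = Compose([
    CVGeometry(degrees=45, translate=(0.0, 0.0), scale=(0.5, 2.0),
               shear=(45, 15), distortion=0.5, p=0.5),
    CVDeterioration(var=20, degrees=6, factor=4, p=0.25),
    CVColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.1, p=0.25),
])

img = Image.new("RGB", (128, 32), color="white")  # stand-in for a cropped text image
aug_img = augment(img)  # each stage fires independently with its own probability p
```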
diff --git a/src/model/TextGen/src/module/abinet/utils.py b/src/model/TextGen/src/module/abinet/utils.py
new file mode 100644
index 0000000..b3730a8
--- /dev/null
+++ b/src/model/TextGen/src/module/abinet/utils.py
@@ -0,0 +1,206 @@
+import logging
+import os
+import time
+
+import numpy as np
+import torch
+import yaml
+from matplotlib import colors
+from matplotlib import pyplot as plt
+from torch import Tensor, nn
+
+
+def prepare_label(labels, charset, device):
+ gt_ids = []
+ gt_lengths = []
+ for label in labels:
+ length = torch.tensor(len(label) + 1, dtype=torch.long)
+ label = charset.get_labels(label, case_sensitive=False)
+ label = torch.tensor(label, dtype=torch.long)
+ label = CharsetMapper.onehot(label, charset.num_classes)
+ gt_ids.append(label)
+ gt_lengths.append(length)
+ gt_ids = torch.stack(gt_ids).to(device)
+ gt_lengths = torch.stack(gt_lengths).to(device)
+ return gt_ids, gt_lengths
+
+
+class CharsetMapper(object):
+ """A simple class to map ids into strings.
+
+ It works only when the character set is 1:1 mapping between individual
+ characters and individual ids.
+ """
+
+ def __init__(self, filename="", max_length=30, null_char="\u2591"):
+ """Creates a lookup table.
+
+ Args:
+ filename: Path to charset file which maps characters to ids.
+            max_length: The max length of ids and string.
+            null_char: A unicode character used to represent the empty character;
+                the default value is the light shade block '░'.
+ """
+ self.null_char = null_char
+ self.max_length = max_length
+
+ self.label_to_char = self._read_charset(filename)
+ self.char_to_label = dict(map(reversed, self.label_to_char.items()))
+ self.num_classes = len(self.label_to_char)
+
+ def _read_charset(self, filename):
+ """Reads a charset definition from a tab separated text file.
+
+ Args:
+ filename: a path to the charset file.
+
+ Returns:
+ a dictionary with keys equal to character codes and values - unicode
+            a dictionary mapping integer label ids to the corresponding unicode
+            characters.
+ import re
+
+ pattern = re.compile(r"(\d+)\t(.+)")
+ charset = {}
+ self.null_label = 0
+ charset[self.null_label] = self.null_char
+ with open(filename, "r") as f:
+ for i, line in enumerate(f):
+ m = pattern.match(line)
+ assert m, f"Incorrect charset file. line #{i}: {line}"
+ label = int(m.group(1)) + 1
+ char = m.group(2)
+ charset[label] = char
+ return charset
+
+ def trim(self, text):
+ assert isinstance(text, str)
+ return text.replace(self.null_char, "")
+
+ def get_text(self, labels, length=None, padding=True, trim=False):
+ """Returns a string corresponding to a sequence of character ids."""
+ length = length if length else self.max_length
+ labels = [l.item() if isinstance(l, Tensor) else int(l) for l in labels]
+ if padding:
+ labels = labels + [self.null_label] * (length - len(labels))
+ text = "".join([self.label_to_char[label] for label in labels])
+ if trim:
+ text = self.trim(text)
+ return text
+
+    @staticmethod
+    def onehot(label, depth, device=None):
+ """
+ Args:
+ label: shape (n1, n2, ..., )
+ depth: a scalar
+
+ Returns:
+ onehot: (n1, n2, ..., depth)
+ """
+ if not isinstance(label, torch.Tensor):
+ label = torch.tensor(label, device=device)
+ onehot = torch.zeros(label.size() + torch.Size([depth]), device=device)
+ onehot = onehot.scatter_(-1, label.unsqueeze(-1), 1)
+
+ return onehot
+
+ def get_labels(self, text, length=None, padding=True, case_sensitive=False):
+ """Returns the labels of the corresponding text."""
+ length = length if length else self.max_length
+ if padding:
+ text = text + self.null_char * (length - len(text))
+ if not case_sensitive:
+ text = text.lower()
+ labels = [self.char_to_label.get(char, self.null_label) for char in text]
+ return labels
+
+ def pad_labels(self, labels, length=None):
+ length = length if length else self.max_length
+
+ return labels + [self.null_label] * (length - len(labels))
+
+ @property
+ def digits(self):
+ return "0123456789"
+
+ @property
+ def digit_labels(self):
+ return self.get_labels(self.digits, padding=False)
+
+ @property
+ def alphabets(self):
+ all_chars = list(self.char_to_label.keys())
+ valid_chars = []
+ for c in all_chars:
+ if c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
+ valid_chars.append(c)
+ return "".join(valid_chars)
+
+ @property
+ def alphabet_labels(self):
+ return self.get_labels(self.alphabets, padding=False)
+
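Editor's note: an editorial round-trip sketch for `CharsetMapper`. The charset path is hypothetical and assumes a tab-separated file of `<id>\t<char>` lines containing the lowercase alphabet, as parsed by `_read_charset` above.

```python
# Editorial sketch: text -> label ids -> text. The path below is illustrative.
mapper = CharsetMapper(filename="data/charset_36.txt", max_length=30)
labels = mapper.get_labels("hello")        # lowercased, padded with null label 0
text = mapper.get_text(labels, trim=True)  # trim strips the '░' padding
assert text == "hello"
```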
+
+class Config(object):
+ def __init__(self, config_path, host=True):
+ def __dict2attr(d, prefix=""):
+ for k, v in d.items():
+ if isinstance(v, dict):
+ __dict2attr(v, f"{prefix}{k}_")
+ else:
+ if k == "phase":
+ assert v in ["train", "test"]
+ if k == "stage":
+ assert v in [
+ "pretrain-vision",
+ "pretrain-language",
+ "train-semi-super",
+ "train-super",
+ ]
+ self.__setattr__(f"{prefix}{k}", v)
+
+        assert os.path.exists(config_path), "%s does not exist!" % config_path
+ with open(config_path) as file:
+ config_dict = yaml.load(file, Loader=yaml.FullLoader)
+ with open(os.path.join(config_path, "configs/template.yaml")) as file:
+ default_config_dict = yaml.load(file, Loader=yaml.FullLoader)
+ __dict2attr(default_config_dict)
+ __dict2attr(config_dict)
+ self.global_workdir = os.path.join(self.global_workdir, self.global_name)
+
+ def __getattr__(self, item):
+ attr = self.__dict__.get(item)
+ if attr is None:
+ attr = dict()
+ prefix = f"{item}_"
+ for k, v in self.__dict__.items():
+ if k.startswith(prefix):
+ n = k.replace(prefix, "")
+ attr[n] = v
+ return attr if len(attr) > 0 else None
+ else:
+ return attr
+
+ def __repr__(self):
+        s = "ModelConfig(\n"
+        for i, (k, v) in enumerate(sorted(vars(self).items())):
+            s += f"\t({i}): {k} = {v}\n"
+        s += ")"
+        return s
+
+
+def onehot(label, depth, device=None):
+ """
+ Args:
+ label: shape (n1, n2, ..., )
+ depth: a scalar
+
+ Returns:
+ onehot: (n1, n2, ..., depth)
+ """
+ if not isinstance(label, torch.Tensor):
+ label = torch.tensor(label, device=device)
+ onehot = torch.zeros(label.size() + torch.Size([depth]), device=device)
+ onehot = onehot.scatter_(-1, label.unsqueeze(-1), 1)
+
+ return onehot
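Editor's note: the module-level `onehot` helper (a duplicate of the method above) is a plain scatter-based encoding; a small editorial example:

```python
# Editorial sketch: scatter-based one-hot encoding.
import torch

labels = torch.tensor([0, 2, 1])
print(onehot(labels, depth=4))
# tensor([[1., 0., 0., 0.],
#         [0., 0., 1., 0.],
#         [0., 1., 0., 0.]])
```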
diff --git a/src/model/TextGen/src/module/loss/vgg_loss.py b/src/model/TextGen/src/module/loss/vgg_loss.py
new file mode 100644
index 0000000..6068b7a
--- /dev/null
+++ b/src/model/TextGen/src/module/loss/vgg_loss.py
@@ -0,0 +1,66 @@
+import torch
+from torchvision.models import vgg19
+
+
+
+class Vgg19(torch.nn.Module):
+ def __init__(self, vgg19_weights):
+ super(Vgg19, self).__init__()
+ vgg = vgg19(pretrained=False)
+ params = torch.load(vgg19_weights)
+ vgg.load_state_dict(params)
+ features = list(vgg.features)
+ self.features = torch.nn.ModuleList(features).eval()
+
+ def forward(self, x):
+ results = []
+ for ii, model in enumerate(self.features):
+ x = model(x)
+
+ if ii in {1, 6, 11, 20, 29}:
+ results.append(x)
+ return results
+
+def build_l1_loss(x_t, x_o):
+ return torch.mean(torch.abs(x_t - x_o))
+
+
+
+def build_perceptual_loss(x):
+ l = []
+ for i, f in enumerate(x):
+ l.append(build_l1_loss(f[0], f[1]))
+ l = torch.stack(l, dim=0)
+ l = l.sum()
+ return l
+
+
+def build_gram_matrix(x):
+ x_shape = x.shape
+ c, h, w = x_shape[1], x_shape[2], x_shape[3]
+ matrix = x.view((-1, c, h * w))
+ matrix1 = torch.transpose(matrix, 1, 2)
+ gram = torch.matmul(matrix, matrix1) / (h * w * c)
+ return gram
+
+
+def build_style_loss(x):
+ l = []
+ for i, f in enumerate(x):
+ f_shape = f[0].shape[0] * f[0].shape[1] * f[0].shape[2]
+ f_norm = 1. / f_shape
+ gram_true = build_gram_matrix(f[0])
+ gram_pred = build_gram_matrix(f[1])
+ l.append(f_norm * (build_l1_loss(gram_true, gram_pred)))
+ l = torch.stack(l, dim=0)
+ l = l.sum()
+ return l
+
+
+def build_vgg_loss(x):
+ splited = []
+ for i, f in enumerate(x):
+ splited.append(torch.chunk(f, 2))
+ l_per = build_perceptual_loss(splited)
+ l_style = build_style_loss(splited)
+ return l_per, l_style
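Editor's note: `build_vgg_loss` expects each VGG feature map to carry the ground truth and the prediction concatenated along the batch axis, which is what `torch.chunk(f, 2)` above undoes. An editorial usage sketch follows; the weight path is illustrative.

```python
# Editorial sketch: perceptual + style loss between a target and a prediction.
import torch

vgg = Vgg19("weights/vgg19.pth").eval()          # path is illustrative
x_true = torch.rand(2, 3, 64, 256)
x_pred = torch.rand(2, 3, 64, 256)
feats = vgg(torch.cat([x_true, x_pred], dim=0))  # 5 maps, each (4, C, H, W)
l_per, l_style = build_vgg_loss(feats)
loss = l_per + l_style                           # weighting is up to the caller
```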
diff --git a/src/model/TextGen/src/module/textencoder/modules.py b/src/model/TextGen/src/module/textencoder/modules.py
new file mode 100644
index 0000000..78f5e07
--- /dev/null
+++ b/src/model/TextGen/src/module/textencoder/modules.py
@@ -0,0 +1,207 @@
+# reference to https://github.com/ZYM-PKU/UDiffText, many thanks.
+from typing import Dict, List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import math
+import string
+from torchvision import transforms
+from timm.models.vision_transformer import VisionTransformer
+
+def autocast(f, enabled=True):
+ def do_autocast(*args, **kwargs):
+ with torch.cuda.amp.autocast(
+ enabled=enabled,
+ dtype=torch.get_autocast_gpu_dtype(),
+ cache_enabled=torch.is_autocast_cache_enabled(),
+ ):
+ return f(*args, **kwargs)
+
+ return do_autocast
+
+
+class AbstractEmbModel(nn.Module):
+ def __init__(self):
+ super().__init__()
+ self._is_trainable = None
+ self._ucg_rate = None
+ self._input_key = None
+ self._emb_key = None
+
+ @property
+ def is_trainable(self) -> bool:
+ return self._is_trainable
+
+ @property
+ def ucg_rate(self) -> Union[float, torch.Tensor]:
+ return self._ucg_rate
+
+ @property
+ def input_key(self) -> str:
+ return self._input_key
+
+ @property
+ def emb_key(self) -> str:
+ return self._emb_key
+
+ @is_trainable.setter
+ def is_trainable(self, value: bool):
+ self._is_trainable = value
+
+ @ucg_rate.setter
+ def ucg_rate(self, value: Union[float, torch.Tensor]):
+ self._ucg_rate = value
+
+ @input_key.setter
+ def input_key(self, value: str):
+ self._input_key = value
+
+ @emb_key.setter
+ def emb_key(self, value: str):
+ self._emb_key = value
+
+ @is_trainable.deleter
+ def is_trainable(self):
+ del self._is_trainable
+
+ @ucg_rate.deleter
+ def ucg_rate(self):
+ del self._ucg_rate
+
+ @input_key.deleter
+ def input_key(self):
+ del self._input_key
+
+ @emb_key.deleter
+ def emb_key(self):
+ del self._emb_key
+
+
+
+
+
+class PositionalEncoding(nn.Module):
+
+ def __init__(self, d_model, dropout=0.1, max_len=5000):
+ super(PositionalEncoding, self).__init__()
+
+ self.dropout = nn.Dropout(p=dropout)
+
+ pe = torch.zeros(max_len, d_model)
+ position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+ div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+ pe[:, 0::2] = torch.sin(position * div_term)
+ pe[:, 1::2] = torch.cos(position * div_term)
+ self.register_buffer('pe', pe)
+
+ def forward(self, x):
+ x = x + torch.tile(self.pe[None, ...].to(x.device), (x.shape[0], 1, 1))
+ return self.dropout(x)
+
+
+class ViTSTREncoder(VisionTransformer):
+ '''
+ ViTSTREncoder is basically a ViT that uses ViTSTR weights
+ '''
+
+ def __init__(self, size=224, ckpt_path=None, freeze=True, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ self.grayscale = transforms.Grayscale()
+ self.resize = transforms.Resize((size, size), transforms.InterpolationMode.BICUBIC, antialias=True)
+
+ self.character = string.printable[:-6]
+ self.reset_classifier(num_classes=len(self.character) + 2)
+
+ if ckpt_path is not None:
+ self.load_state_dict(torch.load(ckpt_path, map_location="cpu"), strict=False)
+
+ if freeze:
+ self.freeze()
+
+ def reset_classifier(self, num_classes):
+ self.num_classes = num_classes
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ def freeze(self):
+ for param in self.parameters():
+ param.requires_grad_(False)
+
+ def forward_features(self, x):
+ B = x.shape[0]
+ x = self.patch_embed(x)
+
+ cls_tokens = self.cls_token.expand(B, -1, -1)
+ x = torch.cat((cls_tokens, x), dim=1)
+ x = x + self.pos_embed
+ x = self.pos_drop(x)
+
+ for blk in self.blocks:
+ x = blk(x)
+
+ x = self.norm(x)
+ return x
+
+ def forward(self, x):
+
+ x = self.forward_features(x)
+
+ return x
+
+ def encode(self, x):
+ return self(x)
+
+
+class LabelEncoder(AbstractEmbModel):
+
+ def __init__(self, max_len, emb_dim, n_heads=8, n_trans_layers=12, ckpt_path=None, trainable=False,
+ lr=1e-4, lambda_cls=0.1, lambda_pos=0.1, clip_dim=1024, visual_len=197, visual_dim=768,
+ visual_config=None, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ self.max_len = max_len
+        self.emb_dim = emb_dim
+ self.n_heads = n_heads
+ self.n_trans_layers = n_trans_layers
+ self.character = string.printable[:-6]
+ self.num_cls = len(self.character) + 1
+
+        self.label_embedding = nn.Embedding(self.num_cls, self.emb_dim)
+        self.pos_embedding = PositionalEncoding(d_model=self.emb_dim, max_len=self.max_len)
+        transformer_block = nn.TransformerEncoderLayer(d_model=self.emb_dim, nhead=self.n_heads, batch_first=True)
+ self.encoder = nn.TransformerEncoder(transformer_block, num_layers=self.n_trans_layers)
+
+ if ckpt_path is not None:
+ self.load_state_dict(torch.load(ckpt_path, map_location="cpu"), strict=False)
+
+
+ def freeze(self):
+ for param in self.parameters():
+ param.requires_grad = False
+
+ def get_index(self, labels):
+
+ indexes = []
+ for label in labels:
+ if len(label) > self.max_len:
+ label = label[:self.max_len]
+ index = [self.character.find(c) + 1 for c in label]
+ index = index + [0] * (self.max_len - len(index))
+ indexes.append(index)
+
+ return torch.tensor(indexes, device=next(self.parameters()).device)
+
+ def get_embeddings(self, x):
+
+ emb = self.label_embedding(x)
+ emb = self.pos_embedding(emb)
+ out = self.encoder(emb)
+
+ return out
+
+ def forward(self, labels):
+
+ idx = self.get_index(labels)
+ out = self.get_embeddings(idx)
+
+ return out
+
+ def encode(self, text):
+ return self(text)
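Editor's note: an editorial sketch of `LabelEncoder` in isolation. The dimensions are illustrative, and `ckpt_path` is omitted so the weights are randomly initialized; each input string is padded to `max_len` character slots.

```python
# Editorial sketch: one embedding per character slot, padded to max_len.
import torch

encoder = LabelEncoder(max_len=24, emb_dim=768)
with torch.no_grad():
    emb = encoder.encode(["Private", "Sunset"])
print(emb.shape)  # torch.Size([2, 24, 768])
```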
diff --git a/src/model/TextGen/src/trainer/Base.py b/src/model/TextGen/src/trainer/Base.py
new file mode 100644
index 0000000..95570ce
--- /dev/null
+++ b/src/model/TextGen/src/trainer/Base.py
@@ -0,0 +1,615 @@
+import pytorch_lightning as pl
+import torch
+import torchvision
+import os
+from omegaconf import OmegaConf
+from diffusers import AutoencoderKL, UNet2DConditionModel
+import sys
+import torch.nn as nn
+from .utils import (
+    load_state_dict,
+    count_params,
+    pl_on_train_tart,
+    module_requires_grad,
+    get_obj_from_str,
+    instantiate_from_config
+)
+import torch.distributed as torchdist
+from typing import Optional, Any, Union, List, Dict
+from src.module.abinet import (
+ ABINetIterModel,
+ MultiLosses,
+ CharsetMapper,
+ prepare_label,
+ postprocess
+)
+import inspect
+from src.module.loss.vgg_loss import (
+ Vgg19,
+ build_vgg_loss
+)
+
+
+
+
+class BaseTrainer(pl.LightningModule):
+ def __init__(self, baseconfig):
+ super().__init__()
+ self.save_hyperparameters()
+ self.data_dtype = torch.float32
+ self.config = baseconfig
+ if os.path.exists(self.config.vae.get("pretrained")):
+ self.vae = AutoencoderKL.from_pretrained(self.config.vae.get("pretrained"))
+ else:
+            print("Failed to initialize the VAE!")
+ sys.exit()
+ self.NORMALIZER = self.config.vae.get("normalizer", 0.18215)
+ self.vae.requires_grad_(False)
+ self.vae.to(self.data_dtype)
+ self.noise_scheduler = get_obj_from_str(self.config.noise_scheduler).from_config(
+ self.config.scheduler_config)
+ self.text_encoder = instantiate_from_config(self.config.text_encoder)
+ if self.config.text_encoder_optimized:
+ self.text_encoder.requires_grad_(True)
+ else:
+ self.text_encoder.requires_grad_(False)
+ self.text_encoder.to(self.data_dtype)
+ self.unet = instantiate_from_config(self.config.unet)
+ self.unet.load_state_dict(load_state_dict(self.config.unet_pretrained, location='cpu'), strict=True)
+ if self.config.ocr_model.get("ocr_supervised", False):
+ print("Initialize ocr model from pretrained...")
+ self.ocr_model = ABINetIterModel(self.config.ocr_model)
+ self.ocr_resize = torchvision.transforms.Resize([self.config.ocr_model.height, self.config.ocr_model.width])
+            if os.path.exists(self.config.ocr_model.get("pretrained", "")):
+                ocr_model_dict = torch.load(self.config.ocr_model.pretrained)
+                self.ocr_model.load_state_dict(state_dict=ocr_model_dict)
+            else:
+                print("Failed to initialize ocr_model: pretrained weights not found!")
+                sys.exit()
+
+ self.ocr_model.to(self.data_dtype)
+ if self.config.ocr_model.get("optimize", False):
+ self.ocr_model.requires_grad_(True)
+ else:
+ print("Frozen ocr model weight...")
+ self.ocr_model.requires_grad_(False)
+ self.charset = CharsetMapper(filename=self.config.ocr_model.charset_path)
+ print("Initialize ocr charsetmapper from {}...".format(self.config.ocr_model.charset_path.rsplit('/', 1)[1]))
+ else:
+ self.ocr_model = None
+
+ self.loss_fn = nn.MSELoss()
+ if self.config.get("vgg_weight", False):
+ self.vgg19 = Vgg19(self.config.vgg_weight).to(self.device)
+ self.vgg19.requires_grad_(False)
+ else:
+ self.vgg19 = None
+ self.count_params()
+
+ def count_params(self):
+ count_params(self.vae)
+ count_params(self.unet)
+ if self.vgg19:
+ count_params(self.vgg19)
+ if self.text_encoder:
+ count_params(self.text_encoder)
+ if self.ocr_model:
+ count_params(self.ocr_model)
+
+
+ def get_text_conditioning(self, c):
+ if hasattr(self.text_encoder, 'encode') and callable(self.text_encoder.encode):
+ c = self.text_encoder.encode(c)
+ else:
+ c = self.text_encoder(c)
+ return c
+
+ def on_train_start(self) -> None:
+        pl_on_train_start(self)
+
+ def on_fit_start(self) -> None:
+ if torchdist.is_initialized():
+ print(f"Fit synchronize on rank: {torchdist.get_rank()}")
+ torchdist.barrier()
+ torch.cuda.synchronize()
+
+ def configure_optimizers(self):
+ lr = self.learning_rate
+ params = [{"params": self.unet.parameters()}]
+ if self.config.text_encoder.get("optimize", False):
+ print("Optimize text encoder")
+ params.append({"params": self.text_encoder.parameters()})
+ print(
+ f"Initialize optimizer with: lr: {lr}, weight_decay: {self.config.weight_decay}, eps: {self.config.adam_epsilon}"
+ )
+ opt = torch.optim.AdamW(
+ params,
+ lr=lr,
+ weight_decay=self.config.weight_decay,
+ eps=self.config.adam_epsilon,
+ )
+ return opt
+
+ def shared_step(self, batch, batch_idx, stage="train"):
+ loss = self(batch)
+ if len(loss) == 1:
+ loss = loss[0]
+ self.log(
+ f"{stage}_traj/loss",
+ loss,
+ batch_size=len(batch["img"]),
+ prog_bar=True,
+ sync_dist=True,
+ )
+ elif len(loss) == 3:
+ loss, mse_loss, ocr_loss = loss
+ self.log(
+ f"{stage}_traj/loss",
+ loss,
+ batch_size=len(batch["img"]),
+ prog_bar=True,
+ sync_dist=True,
+ )
+ self.log(
+ f"{stage}_traj/latent_loss",
+ mse_loss,
+ batch_size=len(batch["img"]),
+ sync_dist=True,
+ )
+ self.log(
+ f"{stage}_traj/ocr_loss",
+ ocr_loss,
+ batch_size=len(batch["img"]),
+ sync_dist=True,
+ )
+ else:
+ loss, mse_loss, ocr_loss, reconstruction_loss = loss
+ self.log(
+ f"{stage}_traj/loss",
+ loss,
+ batch_size=len(batch["img"]),
+ prog_bar=True,
+ sync_dist=True,
+ )
+ self.log(
+ f"{stage}_traj/latent_loss",
+ mse_loss,
+ batch_size=len(batch["img"]),
+ sync_dist=True,
+ )
+ self.log(
+ f"{stage}_traj/ocr_loss",
+ ocr_loss,
+ batch_size=len(batch["img"]),
+ sync_dist=True,
+ )
+ self.log(
+ f"{stage}_traj/reconstruction_loss",
+ reconstruction_loss,
+ batch_size=len(batch["img"]),
+ sync_dist=True,
+ )
+
+ return loss
+
+ def training_step(self, batch, batch_idx):
+ loss = self.shared_step(batch, batch_idx, stage="train")
+ return loss
+
+ def validation_step(self, batch, batch_idx):
+ loss = self.shared_step(batch, batch_idx, stage="valid")
+ return loss
+
+ def prepare_input(self, batch):
+ target_images = batch["img"].to(self.data_dtype)
+ latents = self.vae.encode(target_images).latent_dist.sample()
+ z0 = latents * self.NORMALIZER
+
+ texts = batch["texts"]
+
+ c = self.get_text_conditioning(batch["cond"])
+
+ noise = torch.randn_like(z0)
+ bsz = latents.shape[0]
+
+ t = torch.randint(
+ self.config.min_training_steps, self.noise_scheduler.num_train_timesteps, (bsz,), dtype=torch.long
+ ).to(self.device)
+
+ zt = self.noise_scheduler.add_noise(
+ z0, noise, t)
+ zt = self.noise_scheduler.scale_model_input(
+ zt, t)
+
+ # OCR Supervised
+ if self.ocr_model:
+ gt_ids, gt_lengths = prepare_label(texts, self.charset, self.device)
+ else:
+ gt_ids = None
+ gt_lengths = None
+ return {"noise": noise, "timestep": t, "latent": zt,
+ "cond": c, "ids": gt_ids, "lengths": gt_lengths}
+
+
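+    # Predict the noise residual with the UNet from the noisy latent, timestep,
+    # and text conditioning prepared in prepare_input().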
+ def apply_model(self, dict_input):
+ t = dict_input["t"]
+ zt = dict_input["zt"]
+ c = dict_input["cond"]
+ noise_pred = self.unet(zt, t, c).sample
+ return noise_pred
+
+ def forward(self, batch):
+ dict_input = self.prepare_input(batch)
+ noise_pred = self.apply_model(dict_input)
+
+ noise = dict_input["noise"]
+ t = dict_input["timestep"]
+ zt = dict_input["latent"]
+ gt_ids = dict_input["ids"]
+ gt_lengths = dict_input["lengths"]
+ mse_loss = nn.MSELoss()(noise_pred, noise)
+ loss = mse_loss
+
+ if self.config.ocr_model.get("ocr_supervised", False):
+ pred_x0 = list()
+ for i in range(len(batch["img"])):
+ self.noise_scheduler.set_timesteps(self.config.num_inference_steps, device=self.vae.device)
+ pred_x0.append(self.noise_scheduler.step(
+ noise_pred[i: i + 1], t[i], zt[i: i + 1]
+ ).pred_original_sample)
+ pred_x0 = torch.cat(pred_x0)
+ pred_x0 = 1.0 / self.NORMALIZER * pred_x0
+ pred_image_x0 = self.convert_latent2image(pred_x0, tocpu=False)
+ if self.config.reconstruction_loss:
+ i_vgg = torch.cat((batch["img"].to(self.data_dtype), pred_image_x0), dim=0)
+ out_vgg = self.vgg19(i_vgg)
+ l_f_vgg_per, l_f_vgg_style = build_vgg_loss(out_vgg)
+ reconstruction_loss = l_f_vgg_per * 0.01 + l_f_vgg_style * 100 + nn.MSELoss()(
+ batch["img"].to(self.data_dtype), pred_image_x0)
+ pred_image_x0 = self.ocr_resize(pred_image_x0)
+ outputs = self.ocr_model(pred_image_x0, mode="train")
+ celoss_inputs = outputs[:3]
+ ocr_loss = MultiLosses(True)(celoss_inputs, gt_ids, gt_lengths)
+
+ if self.config.reconstruction_loss:
+ loss = mse_loss + self.config.ocr_loss_alpha * ocr_loss + reconstruction_loss
+ return (loss, mse_loss, ocr_loss, reconstruction_loss)
+ else:
+ loss = mse_loss + self.config.ocr_loss_alpha * ocr_loss
+ return (loss, mse_loss, ocr_loss)
+ return (loss,)
+
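+    # Denoising loop with classifier-free guidance: the latents are duplicated, run
+    # under unconditional and conditional embeddings, and the two noise predictions
+    # are blended with guidance_scale before each scheduler step.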
+ @torch.no_grad()
+ def sample_loop(
+ self,
+ latents: torch.Tensor,
+ encoder_hidden_states: torch.Tensor,
+ timesteps: torch.Tensor,
+ do_classifier_free_guidance: bool = True,
+ guidance_scale: float = 2,
+ return_intermediates: Optional[bool] = False,
+ extra_step_kwargs: Optional[Dict] = {},
+ **kwargs,
+ ):
+ intermediates = []
+ res = {}
+
+ for i, t in enumerate(timesteps):
+ latent_model_input = (
+ torch.cat(
+ [latents] * 2) if do_classifier_free_guidance else latents
+ )
+
+ latent_model_input = self.noise_scheduler.scale_model_input(
+ latent_model_input, t
+ ).to(dtype=self.unet.dtype)
+
+ noise_pred = self.unet(
+ latent_model_input, t, encoder_hidden_states=encoder_hidden_states
+ ).sample
+
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (
+ noise_pred_text - noise_pred_uncond
+ )
+
+ scheduler_res = self.noise_scheduler.step(
+ noise_pred, t, latents, **extra_step_kwargs
+ )
+ latents = scheduler_res.prev_sample
+            assert torch.isnan(latents).sum() == 0, "NaN in latents after scheduler step"
+ if return_intermediates:
+ intermediates.append(scheduler_res.pred_original_sample)
+
+ latents = 1 / self.NORMALIZER * latents
+ res["latents"] = latents
+ if len(intermediates) != 0:
+ intermediates = [1 / self.NORMALIZER * x for x in intermediates]
+ res["intermediates"] = intermediates
+ return res
+
+ def convert_latent2image(self, latent, tocpu=True):
+ image = self.vae.decode(latent).sample
+ image = image.clamp(0, 1)
+ if tocpu:
+ image = image.cpu()
+ return image
+
+ @torch.no_grad()
+ def sample(
+ self,
+ batch: Dict[str, Union[torch.Tensor, List[str]]],
+ guidance_scale: float = 2,
+ num_sample_per_image: int = 1,
+ num_inference_steps: int = 50,
+ eta: float = 0.0,
+ generator: Optional[torch.Generator] = None,
+ return_intermediates: Optional[bool] = False,
+ **kwargs,
+ ):
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ if self.config.cond_on_text_image:
+ cond_texts = batch["cond"].to(self.data_dtype).to(self.device)
+ uncond_texts = torch.zeros_like(batch["cond"].to(self.data_dtype)).to(self.device)
+ else:
+ cond_texts = batch["cond"]
+ uncond_texts = [""] * len(cond_texts)
+ c = self.get_text_conditioning(cond_texts)
+ uc = self.get_text_conditioning(uncond_texts)
+
+
+ B, C, H, W = batch["img"]
+ image_latents = torch.randn((B, C, H // 8, W // 8), device=self.device)
+
+
+ if num_sample_per_image > 1:
+ image_latents = expand_hidden_states(
+ image_latents, num_sample_per_image
+ )
+ uc = expand_hidden_states(
+ uc, num_sample_per_image
+ )
+ c = expand_hidden_states(
+ c, num_sample_per_image
+ )
+
+ encoder_hidden_states = torch.cat([uc, c])
+
+ self.noise_scheduler.set_timesteps(
+ num_inference_steps, device=self.vae.device)
+ timesteps = self.noise_scheduler.timesteps
+ image_latents = image_latents * self.noise_scheduler.init_noise_sigma
+
+ accepts_eta = "eta" in set(
+ inspect.signature(self.noise_scheduler.step).parameters.keys()
+ )
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+ accepts_generator = "generator" in set(
+ inspect.signature(self.noise_scheduler.step).parameters.keys()
+ )
+ if accepts_generator:
+ extra_step_kwargs["generator"] = generator
+ latent_results = self.sample_loop(
+ image_latents,
+ encoder_hidden_states,
+ timesteps,
+ do_classifier_free_guidance,
+ guidance_scale,
+ return_intermediates=return_intermediates,
+ extra_step_kwargs=extra_step_kwargs,
+ **kwargs,
+ )
+
+ image_results = {}
+ images = self.convert_latent2image(latent_results["latents"])
+ image_results["images"] = images
+ if return_intermediates:
+ intermediate_images = [
+ self.convert_latent2image(x) for x in latent_results["intermediates"]
+ ]
+ image_results["intermediate_images"] = intermediate_images
+ return image_results
+
+ @torch.no_grad()
+ def log_images(self, batch, generation_kwargs, stage="train", cat_gt=False):
+ image_results = dict()
+ if (
+ stage == "train" or stage == "validation" or stage == "valid"
+ ):
+ num_sample_per_image = generation_kwargs.get(
+ "num_sample_per_image", 1)
+ sample_results = self.sample(batch, **generation_kwargs)
+ _, _, h, w = batch["img"].shape
+ torch_resize = torchvision.transforms.Resize([h, w])
+ for i, caption in enumerate(batch["texts"]):
+ target_image = batch["img"][i].cpu()
+ target_image = target_image.clamp(0., 1.)
+ target_image = torch_resize(target_image)
+
+
+                cond_image = log_txt_as_img((w, h), batch["texts"][i], self.config.font_path, size=h // 20)
+ cond_image = cond_image.clamp(0., 1.)
+ cond_image = torch_resize(cond_image)
+
+
+ sample_res = sample_results["images"][
+ i * num_sample_per_image: (i + 1) * num_sample_per_image
+ ]
+ if cat_gt:
+ image_results[f"{i}-{caption}"] = torch.cat(
+ [
+ target_image.unsqueeze(0),
+ cond_image.unsqueeze(0),
+ sample_res], dim=0
+ )
+ else:
+ image_results[f"{i}-{caption}"] = sample_res
+
+ return (image_results,)
+
+
+
+from pytorch_lightning import Callback
+from pytorch_lightning.utilities import rank_zero_only
+from pytorch_lightning.loggers import WandbLogger, TensorBoardLogger
+from torchvision.utils import make_grid
+from torchvision.transforms import ToPILImage
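+# Lightning callback that periodically samples images during training/validation
+# and writes them to disk as well as to the active logger (wandb or TensorBoard).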
+class BaseImageLogger(Callback):
+ def __init__(
+ self,
+ train_batch_frequency,
+ valid_batch_frequency,
+ generation_kwargs,
+ metric_callback=None,
+ disable_wandb=False,
+ ):
+ super().__init__()
+ self.batch_freq = {
+ "train": train_batch_frequency,
+ "valid": valid_batch_frequency,
+ }
+ self.generation_kwargs = OmegaConf.to_container(generation_kwargs)
+ self.metric_callback = metric_callback
+ self.disable_wandb = disable_wandb
+
+ def get_log_dir(self, pl_module):
+ if isinstance(pl_module.logger, WandbLogger):
+ return pl_module.logger.experiment.dir
+ elif isinstance(pl_module.logger, TensorBoardLogger):
+ return pl_module.logger.log_dir
+
+ def logger_log_image(self, pl_module, captions, images, global_step, split):
+ if isinstance(pl_module.logger, WandbLogger) and not self.disable_wandb:
+ pl_module.logger.log_image(
+ key=f"{split}_img/{global_step}",
+ images=images,
+ caption=captions,
+ step=global_step,
+ )
+ elif isinstance(pl_module.logger, TensorBoardLogger):
+ big_grid = make_grid(
+ torch.stack(images),
+ padding=3,
+ pad_value=50,
+ )
+ pl_module.logger.experiment.add_image(
+ f"{split}_img",
+ big_grid,
+ global_step=global_step,
+ )
+ pl_module.logger.experiment.add_text(
+ f"{split}_img_caption",
+ " | ".join(captions),
+ global_step=global_step,
+ )
+
+ @rank_zero_only
+ def save_image(self, pl_module, images, global_step, split):
+ print(f"Log images at: {split}/{global_step}")
+ all_image_grids = []
+ all_captions = []
+ for k in images:
+ grid = make_grid(images[k])
+ all_captions.append(f"{k}")
+ all_image_grids.append(grid)
+
+ path = os.path.join(
+ self.get_log_dir(pl_module), split + "_img", str(global_step)
+ )
+ os.makedirs(path, exist_ok=True)
+ for caption, grid in zip(all_captions, all_image_grids):
+ img = ToPILImage()(grid)
+ img.save(os.path.join(path, caption + ".png"))
+
+ self.logger_log_image(
+ pl_module, all_captions,
+ all_image_grids, global_step, split
+ )
+
+
+
+ def log_img(self, pl_module, batch, batch_idx, split="train"):
+ if (
+ self.check_frequency(batch_idx, split)
+ and hasattr(pl_module, "log_images")
+ and callable(pl_module.log_images)
+ ):
+ is_train = pl_module.training
+ if is_train:
+ pl_module.eval()
+
+ with torch.no_grad():
+ generation_samples = pl_module.log_images(
+ batch,
+ generation_kwargs=self.generation_kwargs,
+ stage=split,
+ cat_gt=True,
+ )
+
+ image_results = generation_samples[0]
+ self.save_image(pl_module, image_results,
+ pl_module.global_step, split)
+ if is_train:
+ pl_module.train()
+
+ return generation_samples
+
+
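+    # Training logs every `train_batch_frequency`-th batch (1-indexed); validation
+    # logs whenever batch_idx is divisible by the validation frequency, including 0.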
+ def check_frequency(self, batch_idx, split="train"):
+ if split == "train":
+ if ((batch_idx + 1) % self.batch_freq[split]) == 0:
+ return True
+ else:
+ if (batch_idx % self.batch_freq[split.split("_")[0]]) == 0:
+ return True
+ return False
+
+ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
+ self.generation_kwargs["generator"] = torch.Generator(
+ device=pl_module.device
+ ).manual_seed(self.generation_kwargs["seed"])
+ self.log_img(pl_module, batch, batch_idx, split="train")
+
+ def on_validation_batch_end(
+        self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx
+ ):
+ if trainer.state.stage == "sanity_check":
+ return
+ self.generation_kwargs["generator"] = torch.Generator(
+ device=pl_module.device
+ ).manual_seed(self.generation_kwargs["seed"])
+ self.log_img(pl_module, batch, batch_idx, split="valid")
+
+
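+# Repeat each batch element num_sample_per_image times along the batch dimension
+# so several samples can be drawn per conditioning in a single forward pass.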
+@torch.no_grad()
+def expand_hidden_states(a: torch.Tensor, num_sample_per_image=1):
+ origin_size = a.shape
+ repeat_size = [1] * len(origin_size)
+ repeat_size[1] = num_sample_per_image
+ a = a.repeat(repeat_size)
+ a = a.view(origin_size[0] * num_sample_per_image, *origin_size[1:])
+ return a
+
+from PIL import Image, ImageDraw, ImageFont
+import numpy as np
+import torchvision.transforms as T
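+# Rasterize a caption into an RGB image tensor (wrapping long strings) so the
+# conditioning text can be shown next to generated images in the logger.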
+def log_txt_as_img(wh, xc, font_path, size=10):
+
+ txt = Image.new("RGB", wh, color="white")
+ draw = ImageDraw.Draw(txt)
+ font = ImageFont.truetype(font_path, size=size)
+ nc = int(40 * (wh[0] / 256))
+ lines = "\n".join(xc[start:start + nc] for start in range(0, len(xc), nc))
+
+ try:
+ draw.text((0, 0), lines, fill="black", font=font)
+ except UnicodeEncodeError:
+ print("Cant encode string for logging. Skipping.")
+
+ txt = T.ToTensor()(txt)
+ return txt
+
diff --git a/src/model/TextGen/src/trainer/CtrlBase.py b/src/model/TextGen/src/trainer/CtrlBase.py
new file mode 100644
index 0000000..c431d57
--- /dev/null
+++ b/src/model/TextGen/src/trainer/CtrlBase.py
@@ -0,0 +1,659 @@
+import einops
+import torch
+import inspect
+import torch as th
+from typing import Optional, Any, Union, List, Dict
+import torch.nn as nn
+from diffusers import UNet2DConditionModel
+from dataclasses import dataclass
+from diffusers.utils import BaseOutput, logging
+from src.trainer.Base import BaseTrainer, expand_hidden_states, log_txt_as_img
+import torchvision
+from .utils import (
+ count_params,
+    pl_on_train_start,
+ module_requires_grad,
+ get_obj_from_str,
+ instantiate_from_config
+)
+import numpy as np
+from functools import partial
+import torch.nn.functional as F
+from timm.models.layers import drop_path, to_2tuple, trunc_normal_
+from einops import rearrange
+from diffusers.models.resnet import ResnetBlock2D
+
+
+class ControlBase(BaseTrainer):
+
+ def __init__(self, control_config, base_config):
+ super().__init__(base_config)
+ self.control_model = instantiate_from_config(control_config)
+ self.control_scales = [1.0] * 13
+
+
+ @torch.no_grad()
+ def prepare_input(self, batch):
+ dict_input = super().prepare_input(batch)
+ hint = batch["hint"]
+ dict_input["hint"] = hint
+ return dict_input
+
+ def apply_model(self, dict_input):
+ t = dict_input["timestep"]
+ zt = dict_input["latent"]
+ c = dict_input["cond"]
+ hint = dict_input["hint"]
+ control = self.control_model(hint, zt, t, c)
+ control = [c * scale for c, scale in zip(control, self.control_scales)]
+ unet = self.unet
+ noise_pred = unet(x=zt, timestep=t, encoder_hidden_states=c, control=control).sample
+ return noise_pred
+
+ @torch.no_grad()
+ def sample_loop(
+ self,
+ latents: torch.Tensor,
+ encoder_hidden_states: torch.Tensor,
+ hint: torch.Tensor,
+ timesteps: torch.Tensor,
+ do_classifier_free_guidance: bool = True,
+ guidance_scale: float = 2,
+ return_intermediates: Optional[bool] = False,
+ extra_step_kwargs: Optional[Dict] = {},
+ **kwargs,
+ ):
+ intermediates = []
+ res = {}
+
+ for i, t in enumerate(timesteps):
+ latent_model_input = (
+ torch.cat(
+ [latents] * 2) if do_classifier_free_guidance else latents
+ )
+ latent_model_input = self.noise_scheduler.scale_model_input(
+ latent_model_input, t
+ ).to(dtype=self.unet.dtype)
+
+
+ if do_classifier_free_guidance:
+ hint_input = torch.cat([hint] * 2)
+ else:
+ hint_input = hint
+ control_input = self.control_model(hint_input, latent_model_input, t, encoder_hidden_states)
+
+ noise_pred = self.unet(
+ x=latent_model_input, timestep=t, encoder_hidden_states=encoder_hidden_states, control=control_input
+ ).sample
+
+ if do_classifier_free_guidance:
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+ noise_pred = noise_pred_uncond + guidance_scale * (
+ noise_pred_text - noise_pred_uncond
+ )
+
+ scheduler_res = self.noise_scheduler.step(
+ noise_pred, t, latents, **extra_step_kwargs
+ )
+ latents = scheduler_res.prev_sample
+            assert torch.isnan(latents).sum() == 0, "NaN in latents after scheduler step"
+ if return_intermediates:
+ intermediates.append(scheduler_res.pred_original_sample)
+
+ latents = 1 / self.NORMALIZER * latents
+ res["latents"] = latents
+ if len(intermediates) != 0:
+ intermediates = [1 / self.NORMALIZER * x for x in intermediates]
+ res["intermediates"] = intermediates
+ return res
+
+ def convert_latent2image(self, latent, tocpu=True):
+ image = self.vae.decode(latent).sample
+ image = image.clamp(0, 1)
+ if tocpu:
+ image = image.cpu()
+ return image
+
+ @torch.no_grad()
+ def sample(
+ self,
+ batch: Dict[str, Union[torch.Tensor, List[str]]],
+ guidance_scale: float = 2,
+ num_sample_per_image: int = 1,
+ num_inference_steps: int = 50,
+ eta: float = 0.0,
+ generator: Optional[torch.Generator] = None,
+ return_intermediates: Optional[bool] = False,
+ **kwargs,
+ ):
+ do_classifier_free_guidance = guidance_scale > 1.0
+
+ if self.config.cond_on_text_image:
+ cond_texts = batch["cond"].to(self.data_dtype).to(self.device)
+ uncond_texts = torch.zeros_like(batch["cond"].to(self.data_dtype)).to(self.device)
+ else:
+ cond_texts = batch["cond"]
+ uncond_texts = [""] * len(cond_texts)
+ c = self.get_text_conditioning(cond_texts)
+ uc = self.get_text_conditioning(uncond_texts)
+
+ hint = batch["hint"].to(self.device)
+
+
+
+ B, _, H, W = batch["img"].shape
+ image_latents = torch.randn((B, 4, H // 8, W // 8), device=self.device)
+
+ if num_sample_per_image > 1:
+ image_latents = expand_hidden_states(
+ image_latents, num_sample_per_image
+ )
+ uc = expand_hidden_states(
+ uc, num_sample_per_image
+ )
+ c = expand_hidden_states(
+ c, num_sample_per_image
+ )
+ hint = expand_hidden_states(
+ hint, num_sample_per_image
+ )
+ encoder_hidden_states = torch.cat([uc, c])
+
+ self.noise_scheduler.set_timesteps(
+ num_inference_steps, device=self.vae.device)
+ timesteps = self.noise_scheduler.timesteps
+
+ image_latents = image_latents * self.noise_scheduler.init_noise_sigma
+
+ accepts_eta = "eta" in set(
+ inspect.signature(self.noise_scheduler.step).parameters.keys()
+ )
+ extra_step_kwargs = {}
+ if accepts_eta:
+ extra_step_kwargs["eta"] = eta
+ accepts_generator = "generator" in set(
+ inspect.signature(self.noise_scheduler.step).parameters.keys()
+ )
+ if accepts_generator:
+ extra_step_kwargs["generator"] = generator
+
+ latent_results = self.sample_loop(
+ image_latents,
+ encoder_hidden_states,
+ hint,
+ timesteps,
+ do_classifier_free_guidance,
+ guidance_scale,
+ return_intermediates=return_intermediates,
+ extra_step_kwargs=extra_step_kwargs,
+ **kwargs,
+ )
+
+ image_results = {}
+ images = self.convert_latent2image(latent_results["latents"])
+ image_results["images"] = images
+ if return_intermediates:
+ intermediate_images = [
+ self.convert_latent2image(x) for x in latent_results["intermediates"]
+ ]
+ image_results["intermediate_images"] = intermediate_images
+ return image_results
+
+
+ @torch.no_grad()
+ def log_images(self, batch, generation_kwargs, stage="train", cat_gt=False):
+ image_results = dict()
+ if (
+ stage == "train" or stage == "validation" or stage == "valid"
+ ):
+ num_sample_per_image = generation_kwargs.get(
+ "num_sample_per_image", 1)
+
+ sample_results = self.sample(batch, **generation_kwargs)
+
+ _, _, h, w = batch["img"].shape
+ torch_resize = torchvision.transforms.Resize([h, w])
+ for i, caption in enumerate(batch["texts"]):
+ target_image = batch["img"][i].cpu()
+ target_image = target_image.clamp(0., 1.)
+ target_image = torch_resize(target_image)
+
+ style_image = batch["hint"][i].cpu()
+ style_image = style_image.clamp(0., 1.)
+ style_image = torch_resize(style_image)
+
+
+                cond_image = log_txt_as_img((w, h), batch["cond"][i], self.config.font_path, size=h // 20)
+ cond_image = cond_image.clamp(0., 1.)
+ cond_image = torch_resize(cond_image)
+
+
+ sample_res = sample_results["images"][
+ i * num_sample_per_image: (i + 1) * num_sample_per_image
+ ]
+ if cat_gt:
+ image_results[f"{i}-{caption}"] = torch.cat(
+ [
+ target_image.unsqueeze(0),
+ style_image.unsqueeze(0),
+ cond_image.unsqueeze(0),
+ sample_res], dim=0
+ )
+ else:
+ image_results[f"{i}-{caption}"] = sample_res
+
+ return (image_results,)
+
+ def configure_optimizers(self):
+ lr = self.learning_rate
+ params = [{"params": self.control_model.parameters()}]
+ if not self.sd_locked:
+ params.append({"params": self.unet.parameters()})
+ else:
+ for name, parameter in self.unet.named_parameters():
+ parameter.requires_grad = False
+ opt = torch.optim.AdamW(params, lr=lr)
+ return opt
+
+
+@dataclass
+class UNet2DConditionOutput(BaseOutput):
+ sample: torch.FloatTensor
+
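+# UNet2DConditionModel variant that consumes ControlNet-style residuals: the last
+# entry of `control` is added to the mid-block output and the remaining entries
+# are added to the matching skip connections before each up block.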
+class ControlUNetModel(UNet2DConditionModel):
+    def forward(self, x, timestep=None, encoder_hidden_states=None, control=None, return_dict: bool = True, **kwargs):
+ default_overall_up_factor = 2 ** self.num_upsamplers
+
+ forward_upsample_size = False
+ upsample_size = None
+
+ if any(s % default_overall_up_factor != 0 for s in x.shape[-2:]):
+ forward_upsample_size = True
+
+ if self.config.center_input_sample:
+ x = 2 * x - 1.0
+
+ with torch.no_grad():
+ timesteps = timestep
+ if not torch.is_tensor(timesteps):
+ timesteps = torch.tensor([timesteps], dtype=torch.long, device=x.device)
+ elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
+ timesteps = timesteps[None].to(x.device)
+
+ timesteps = timesteps.expand(x.shape[0])
+ t_emb = self.time_proj(timesteps)
+ t_emb = t_emb.to(dtype=self.dtype)
+ emb = self.time_embedding(t_emb)
+ x = self.conv_in(x)
+ xs = (x,)
+ for downsample_block in self.down_blocks:
+ if hasattr(downsample_block, "attentions") and downsample_block.attentions is not None:
+ x, res_x = downsample_block(hidden_states=x, temb=emb, encoder_hidden_states=encoder_hidden_states)
+ else:
+ x, res_x = downsample_block(hidden_states=x, temb=emb)
+ xs += res_x
+
+ x = self.mid_block(x, emb, encoder_hidden_states)
+        if control is not None:
+            x += control.pop()
+
+
+ for i, upsample_block in enumerate(self.up_blocks):
+ is_final_block = i == len(self.up_blocks) - 1
+
+ res_xs = xs[-len(upsample_block.resnets):]
+ xs = xs[: -len(upsample_block.resnets)]
+
+ add_res = ()
+ if control is not None:
+ for item in res_xs[::-1]:
+ temp = item + control.pop()
+ add_res += (temp,)
+ add_res = add_res[::-1]
+ else:
+ add_res = res_xs
+
+
+ if not is_final_block and forward_upsample_size:
+ upsample_size = xs[-1].shape[2:]
+
+ if hasattr(upsample_block, "attentions") and upsample_block.attentions is not None:
+ x = upsample_block(
+ hidden_states=x,
+ temb=emb,
+ res_hidden_states_tuple=add_res,
+ encoder_hidden_states=encoder_hidden_states,
+ upsample_size=upsample_size,
+ )
+ else:
+ x = upsample_block(
+ hidden_states=x, temb=emb, res_hidden_states_tuple=add_res, upsample_size=upsample_size
+ )
+ x = self.conv_norm_out(x)
+ x = self.conv_act(x)
+ x = self.conv_out(x)
+
+ if not return_dict:
+ return (x,)
+
+ return UNet2DConditionOutput(sample=x)
+
+
+class PatchEmbed(nn.Module):
+ """
+ Image to Patch Embedding
+ """
+ def __init__(self, img_size=128, patch_size=16, in_chans=3, embed_dim=768):
+ super().__init__()
+ img_size = to_2tuple(img_size)
+ patch_size = to_2tuple(patch_size)
+ num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
+ self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
+ self.img_size = img_size
+ self.patch_size = patch_size
+ self.num_patches = num_patches
+
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+ def forward(self, x, **kwargs):
+ B, C, H, W = x.shape
+ assert H == self.img_size[0] and W == self.img_size[1], \
+ f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+ x = self.proj(x).flatten(2).transpose(1, 2)
+ return x
+
+
+class Attention(nn.Module):
+ def __init__(
+ self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
+ proj_drop=0., attn_head_dim=None):
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ if attn_head_dim is not None:
+ head_dim = attn_head_dim
+ all_head_dim = head_dim * self.num_heads
+ self.scale = qk_scale or head_dim ** -0.5
+
+ self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
+ if qkv_bias:
+ self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
+ self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
+ else:
+ self.q_bias = None
+ self.v_bias = None
+
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(all_head_dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ def forward(self, x):
+ B, N, C = x.shape
+ qkv_bias = None
+ if self.q_bias is not None:
+ qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
+ qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
+ qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[2]
+
+ q = q * self.scale
+ attn = (q @ k.transpose(-2, -1))
+
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+class Mlp(nn.Module):
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.fc1 = nn.Linear(in_features, hidden_features)
+ self.act = act_layer()
+ self.fc2 = nn.Linear(hidden_features, out_features)
+ self.drop = nn.Dropout(drop)
+
+ def forward(self, x):
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
+
+
+class DropPath(nn.Module):
+
+ def __init__(self, drop_prob=None):
+ super(DropPath, self).__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, x):
+ return drop_path(x, self.drop_prob, self.training)
+
+ def extra_repr(self) -> str:
+ return 'p={}'.format(self.drop_prob)
+
+
+class Block(nn.Module):
+
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+ drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
+ attn_head_dim=None):
+ super().__init__()
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention(
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim)
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+ if init_values > 0:
+ self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
+ self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
+ else:
+ self.gamma_1, self.gamma_2 = None, None
+
+ def forward(self, x):
+ if self.gamma_1 is None:
+ x = x + self.drop_path(self.attn(self.norm1(x)))
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ else:
+ x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x)))
+ x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
+ return x
+
+
+def get_sinusoid_encoding_table(n_position, d_hid):
+ ''' Sinusoid position encoding table '''
+ def get_position_angle_vec(position):
+ return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
+
+ sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])
+
+ return torch.FloatTensor(sinusoid_table).unsqueeze(0)
+
+
+class VisionTransformerEncoder(nn.Module):
+ """
+ Pretrain Vision Transformer backbone.
+ """
+ def __init__(self, img_size=128, patch_size=16, in_chans=3, num_classes=0, embed_dim=768, depth=12,
+ num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
+ drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None,
+ use_learnable_pos_emb=False):
+ super().__init__()
+ self.num_classes = num_classes
+ self.num_features = self.embed_dim = embed_dim
+
+ self.patch_embed = PatchEmbed(
+ img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+ num_patches = self.patch_embed.num_patches
+
+ if use_learnable_pos_emb:
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
+ else:
+ self.pos_embed = get_sinusoid_encoding_table(num_patches, embed_dim)
+
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
+ self.blocks = nn.ModuleList([
+ Block(
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
+ init_values=init_values)
+ for i in range(depth)])
+ self.norm = norm_layer(embed_dim)
+ self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ if use_learnable_pos_emb:
+ trunc_normal_(self.pos_embed, std=.02)
+
+ self.apply(self._init_weights)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ nn.init.xavier_uniform_(m.weight)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+
+ def get_num_layers(self):
+ return len(self.blocks)
+
+ @torch.jit.ignore
+ def no_weight_decay(self):
+ return {'pos_embed', 'cls_token'}
+
+ def get_classifier(self):
+ return self.head
+
+ def reset_classifier(self, num_classes, global_pool=''):
+ self.num_classes = num_classes
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ def forward_features(self, x, mask):
+ x = self.patch_embed(x)
+ x = x + self.pos_embed.type_as(x).to(x.device).clone().detach()
+
+ B, _, C = x.shape
+        if mask is None:
+ x_vis = x.reshape(B, -1, C)
+ else:
+ x_vis = x[~mask].reshape(B, -1, C)
+
+ for blk in self.blocks:
+ x_vis = blk(x_vis)
+
+ x_vis = self.norm(x_vis)
+ return x_vis
+
+ def forward(self, x, mask):
+ x = self.forward_features(x, mask)
+ x = self.head(x)
+ return x
+
+
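+# Style control branch: a ViT backbone encodes the style hint image, a pyramid of
+# adaptive-average-pool stages yields multi-scale feature maps, and zero-initialized
+# 1x1 convs project them to the channel widths the control-aware UNet expects
+# (12 skip-connection residuals plus 1 mid-block residual).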
+class StylePyramidNet(nn.Module):
+ def __init__(
+ self,
+ image_size=256,
+ patch_size=16,
+ in_channels=3,
+ embed_dim=768,
+ model_channels=320,
+ channel_mult=(1, 2, 4, 4),
+ pyramid_sizes=(32, 16, 8, 4),
+ use_checkpoint=True,
+ use_new_attention_order=False,
+ use_scale_shift_norm=False,
+ dims=2,
+ dropout=0,
+ ):
+ super().__init__()
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.model_channels = model_channels
+
+ self.vit = VisionTransformerEncoder(img_size=image_size, patch_size=patch_size, in_chans=in_channels,
+ embed_dim=embed_dim, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True,
+ qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.,
+ norm_layer=partial(nn.LayerNorm, eps=1e-6), init_values=0,
+ use_learnable_pos_emb=False, )
+
+ self.stages = nn.ModuleList([self._make_stage(embed_dim, size)
+ for i, size in enumerate(pyramid_sizes)])
+
+ self.zero_convs = nn.ModuleList([
+ self.make_zero_conv(embed_dim, model_channels * channel_mult[0]),
+ self.make_zero_conv(embed_dim, model_channels * channel_mult[0]),
+ self.make_zero_conv(embed_dim, model_channels * channel_mult[0]),
+ self.make_zero_conv(embed_dim, model_channels * channel_mult[0]),
+ self.make_zero_conv(embed_dim, model_channels * channel_mult[1]),
+ self.make_zero_conv(embed_dim, model_channels * channel_mult[1]),
+ self.make_zero_conv(embed_dim, model_channels * channel_mult[1]),
+ self.make_zero_conv(embed_dim, model_channels * channel_mult[2]),
+ self.make_zero_conv(embed_dim, model_channels * channel_mult[2]),
+ self.make_zero_conv(embed_dim, model_channels * channel_mult[2]),
+ self.make_zero_conv(embed_dim, model_channels * channel_mult[3]),
+ self.make_zero_conv(embed_dim, model_channels * channel_mult[3]),
+ ])
+
+ self.middle_block = nn.ModuleList([
+ ResnetBlock2D(in_channels=embed_dim, down=True),
+ ResnetBlock2D(in_channels=embed_dim, down=True),
+ ])
+        self.middle_block_out = self.make_zero_conv(embed_dim, model_channels * channel_mult[-1])
+
+    def make_zero_conv(self, in_channels, out_channels):
+ return zero_module(nn.Conv2d(in_channels, out_channels, 1, padding=0))
+
+ def _make_stage(self, features, size):
+ prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
+ conv = nn.Conv2d(features, features, kernel_size=1, bias=False)
+ return nn.Sequential(prior, conv)
+
+ def forward(self, hint, sample=None, timesteps=None, encoder_hidden_states=None):
+ h = self.vit(hint, mask=None)
+ h_in = rearrange(h, 'b (h w) c -> b c h w', h=self.image_size//self.patch_size)
+
+ outs = []
+ for i, stage in enumerate(self.stages):
+ h_out = stage(h_in)
+ for zero_conv in self.zero_convs[3*i: 3*i+3]:
+ outs.append(zero_conv(h_out))
+
+ hmid = h_in
+ for block in self.middle_block:
+ hmid = block(input_tensor=hmid, temb=None)
+ outs.append(self.middle_block_out(hmid))
+
+ return outs
+
+from einops import repeat
+import math
+
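+# Zero-initialize a module's parameters so the control branch starts out as an
+# identity (no-op) residual, the usual ControlNet trick for stable fine-tuning.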
+def zero_module(module):
+ for p in module.parameters():
+ p.detach().zero_()
+ return module
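+
+# Standard sinusoidal timestep embedding (as in DDPM); with repeat_only=True the
+# scalar timestep is simply broadcast to the embedding dimension instead.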
+def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
+ if not repeat_only:
+ half = dim // 2
+ freqs = torch.exp(
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+ ).to(device=timesteps.device)
+ args = timesteps[:, None].float() * freqs[None]
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+ if dim % 2:
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+ else:
+ embedding = repeat(timesteps, 'b -> b d', d=dim)
+ return embedding
diff --git a/src/model/TextGen/src/trainer/utils.py b/src/model/TextGen/src/trainer/utils.py
new file mode 100644
index 0000000..6d91953
--- /dev/null
+++ b/src/model/TextGen/src/trainer/utils.py
@@ -0,0 +1,83 @@
+import os
+import json
+import torch
+import argparse
+import importlib
+import torch.nn as nn
+import pytorch_lightning as pl
+from pytorch_lightning.loggers import WandbLogger, TensorBoardLogger
+from pytorch_lightning import Trainer, Callback
+from pytorch_lightning.utilities import rank_zero_only
+
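+# Unwrap Lightning-style checkpoints: if the file stores {"state_dict": ...},
+# return the inner mapping, otherwise return the object unchanged.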
+def get_state_dict(d):
+ return d.get('state_dict', d)
+
+def load_state_dict(ckpt_path, location='cpu'):
+ _, extension = os.path.splitext(ckpt_path)
+ if extension.lower() == ".safetensors":
+ import safetensors.torch
+ state_dict = safetensors.torch.load_file(ckpt_path, device=location)
+ else:
+ state_dict = get_state_dict(torch.load(ckpt_path, map_location=torch.device(location)))
+ state_dict = get_state_dict(state_dict)
+ print(f'Loaded state_dict from [{ckpt_path}]')
+ return state_dict
+
+def count_params(model):
+ total_params = sum(p.numel() for p in model.parameters())
+ print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")
+
+
+def get_obj_from_str(string, reload=False):
+ module, cls = string.rsplit(".", 1)
+ if reload:
+ module_imp = importlib.import_module(module)
+ importlib.reload(module_imp)
+ return getattr(importlib.import_module(module, package=None), cls)
+
+
+def instantiate_from_config(config):
+ if not "target" in config:
+ raise KeyError("Expected key `target` to instantiate.")
+ return get_obj_from_str(config["target"])(**config.get("params", dict()))
+
+
+def nondefault_trainer_args(opt):
+ parser = argparse.ArgumentParser()
+ parser = Trainer.add_argparse_args(parser)
+ args = parser.parse_args([])
+ return sorted(k for k in vars(args) if getattr(opt, k) != getattr(args, k))
+
+
+def module_requires_grad(module: nn.Module):
+ for param in module.parameters():
+ if not param.requires_grad:
+ return False
+ return True
+
+@rank_zero_only
+def pl_on_train_start(pl_module: pl.LightningModule):
+    # Check isinstance on the logger itself: `logger.experiment` returns the raw
+    # wandb Run / TensorBoard SummaryWriter, which is never a Logger subclass.
+    logger = pl_module.logger
+    if isinstance(logger, WandbLogger):
+        print("Logging code")
+        logger.experiment.log_code(
+            os.getcwd(),
+            include_fn=lambda path: path.endswith(".py") or path.endswith(".ipynb") or path.endswith(".yaml")
+        )
+    elif isinstance(logger, TensorBoardLogger):
+        print("Logging git info")
+        logger.log_hyperparams({"git_version": os.popen("git log -1").read().split("\n")[0]})
+
+ print("***** Start training *****")
+ num_samples = len(pl_module.trainer.train_dataloader.dataset)
+ max_epoch = pl_module.trainer.max_epochs
+ total_step = pl_module.trainer.estimated_stepping_batches
+ total_batch_size = round(num_samples * max_epoch / total_step)
+ print(f" Num examples = {num_samples}")
+ print(f" Num Epochs = {max_epoch}")
+ print(f" Total GPU device number: {pl_module.trainer.num_devices}")
+ print(f" Gradient Accumulation steps = {pl_module.trainer.accumulate_grad_batches}")
+ print(f" Instant batch size: {round(total_batch_size * pl_module.trainer.num_devices)}")
+ print(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+ print(f" Total optimization steps = {total_step}")
+
\ No newline at end of file
diff --git a/src/model/TextGen/train.py b/src/model/TextGen/train.py
new file mode 100644
index 0000000..6fbae15
--- /dev/null
+++ b/src/model/TextGen/train.py
@@ -0,0 +1,30 @@
+import pytorch_lightning as pl
+from torch.utils.data import DataLoader
+from src.dataset.textdata import TextDataset
+from src.trainer.Base import BaseImageLogger
+from utils import create_model, load_state_dict, create_data
+from omegaconf import OmegaConf
+from pytorch_lightning.callbacks import ModelCheckpoint
+from src.trainer.utils import instantiate_from_config
+
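+# Training hyperparameters; sd_locked=False also fine-tunes the SD UNet instead of
+# training only the control branch (see ControlBase.configure_optimizers).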
+learning_rate = 1e-5
+checkpoint_freq = 10
+sd_locked = False
+
+cfg_path = "configs/train.yaml"
+model = create_model(cfg_path).cpu()
+vit_path = "weights/style_encoder.pth"
+model.load_state_dict(load_state_dict(vit_path, location='cpu'), strict=False)
+model.learning_rate = learning_rate
+model.sd_locked = sd_locked
+
+cfgs = OmegaConf.load(cfg_path)
+data_config = cfgs.pop("data", OmegaConf.create())
+data = create_data(data_config)
+data.prepare_data()
+data.setup(stage='fit')
+
+checkpoint_callback = ModelCheckpoint(every_n_epochs=checkpoint_freq, save_top_k=-1)
+imagelogger_callback = instantiate_from_config(cfgs.image_logger)
+trainer = pl.Trainer(precision=32, callbacks=[imagelogger_callback, checkpoint_callback], **cfgs.lightning)
+trainer.fit(model=model, datamodule=data)
diff --git a/src/model/TextGen/utils.py b/src/model/TextGen/utils.py
new file mode 100644
index 0000000..fc9f065
--- /dev/null
+++ b/src/model/TextGen/utils.py
@@ -0,0 +1,41 @@
+import os
+import torch
+import importlib
+from omegaconf import OmegaConf
+from src.trainer.utils import instantiate_from_config
+
+
+def get_state_dict(d):
+ return d.get('state_dict', d)
+
+
+def load_state_dict(ckpt_path, location='cpu'):
+ _, extension = os.path.splitext(ckpt_path)
+ if extension.lower() == ".safetensors":
+ import safetensors.torch
+ state_dict = safetensors.torch.load_file(ckpt_path, device=location)
+ else:
+ state_dict = get_state_dict(torch.load(ckpt_path, map_location=torch.device(location)))
+ state_dict = get_state_dict(state_dict)
+ print(f'Loaded state_dict from [{ckpt_path}]')
+ return state_dict
+
+
+def create_model(config_path):
+ config = OmegaConf.load(config_path)
+ model = instantiate_from_config(config.model).cpu()
+ print(f'Loaded model config from [{config_path}]')
+ return model
+
+def get_obj_from_str(string, reload=False, invalidate_cache=True):
+ module, cls = string.rsplit(".", 1)
+ if invalidate_cache:
+ importlib.invalidate_caches()
+ if reload:
+ module_imp = importlib.import_module(module)
+ importlib.reload(module_imp)
+ return getattr(importlib.import_module(module, package=None), cls)
+
+def create_data(config):
+ data_cls = get_obj_from_str(config.target)
+ data = data_cls(data_config=config)
+ return data
\ No newline at end of file