tracyjxm committed on Jun 24, 2025

Commit

b31d63a

verified ·

1 Parent(s): 1bf77b5

Upload folder using huggingface_hub

Browse files

Files changed (35) hide show

kandinsky3/.ipynb_checkpoints/__init__-checkpoint.py +267 -0
kandinsky3/.ipynb_checkpoints/condition_encoders-checkpoint.py +40 -0
kandinsky3/.ipynb_checkpoints/condition_processors-checkpoint.py +34 -0
kandinsky3/.ipynb_checkpoints/inpainting_pipeline-checkpoint.py +168 -0
kandinsky3/.ipynb_checkpoints/movq-checkpoint.py +431 -0
kandinsky3/.ipynb_checkpoints/t2i_pipeline-checkpoint.py +109 -0
kandinsky3/.ipynb_checkpoints/utils-checkpoint.py +71 -0
kandinsky3/__init__.py +267 -0
kandinsky3/__pycache__/__init__.cpython-310.pyc +0 -0
kandinsky3/__pycache__/condition_encoders.cpython-310.pyc +0 -0
kandinsky3/__pycache__/condition_processors.cpython-310.pyc +0 -0
kandinsky3/__pycache__/inpainting_pipeline.cpython-310.pyc +0 -0
kandinsky3/__pycache__/movq.cpython-310.pyc +0 -0
kandinsky3/__pycache__/t2i_pipeline.cpython-310.pyc +0 -0
kandinsky3/__pycache__/utils.cpython-310.pyc +0 -0
kandinsky3/condition_encoders.py +40 -0
kandinsky3/condition_processors.py +34 -0
kandinsky3/inpainting_pipeline.py +168 -0
kandinsky3/model/.ipynb_checkpoints/diffusion-checkpoint.py +200 -0
kandinsky3/model/.ipynb_checkpoints/unet-checkpoint.py +516 -0
kandinsky3/model/__init__.py +0 -0
kandinsky3/model/__pycache__/__init__.cpython-310.pyc +0 -0
kandinsky3/model/__pycache__/diffusion.cpython-310.pyc +0 -0
kandinsky3/model/__pycache__/nn.cpython-310.pyc +0 -0
kandinsky3/model/__pycache__/unet.cpython-310.pyc +0 -0
kandinsky3/model/__pycache__/utils.cpython-310.pyc +0 -0
kandinsky3/model/diffusion.py +200 -0
kandinsky3/model/nn.py +84 -0
kandinsky3/model/unet.py +516 -0
kandinsky3/model/utils.py +62 -0
kandinsky3/movq.py +431 -0
kandinsky3/setup.py +38 -0
kandinsky3/t2i_pipeline.py +109 -0
kandinsky3/utils.py +71 -0
unet_model_checkpoint.pt +3 -0

kandinsky3/.ipynb_checkpoints/__init__-checkpoint.py ADDED Viewed

	@@ -0,0 +1,267 @@

+import os
+from typing import Optional, Union
+import torch
+from huggingface_hub import hf_hub_download, snapshot_download
+from kandinsky3.model.unet import UNet
+from kandinsky3.movq import MoVQ
+from kandinsky3.condition_encoders import T5TextConditionEncoder
+from kandinsky3.condition_processors import T5TextConditionProcessor
+from kandinsky3.model.diffusion import BaseDiffusion, get_named_beta_schedule
+from .t2i_pipeline import Kandinsky3T2IPipeline
+from .inpainting_pipeline import Kandinsky3InpaintingPipeline
+def get_T2I_unet(
+        device: Union[str, torch.device],
+        weights_path: Optional[str] = None,
+        dtype: Union[str, torch.dtype] = torch.float32,
+) -> "tuple[UNet, Optional[torch.Tensor]]":
+    """Build the text-to-image UNet and optionally load its checkpoint.
+
+    Args:
+        device: Device the UNet is moved to.
+        weights_path: Path to a checkpoint holding the keys ``'unet'`` and
+            ``'null_embedding'``. When falsy the UNet keeps its fresh
+            initialisation.
+        dtype: Parameter dtype the UNet is cast to.
+
+    Returns:
+        A ``(unet, null_embedding)`` pair. ``null_embedding`` is ``None``
+        when no checkpoint was loaded. The UNet is returned in eval mode.
+    """
+    unet = UNet(
+        model_channels=384,
+        num_channels=4,
+        init_channels=192,
+        time_embed_dim=1536,
+        context_dim=4096,
+        groups=32,
+        head_dim=64,
+        expansion_ratio=4,
+        compression_ratio=2,
+        dim_mult=(1, 2, 4, 8),
+        num_blocks=(3, 3, 3, 3),
+        add_cross_attention=(False, True, True, True),
+        add_self_attention=(False, True, True, True),
+    )
+    null_embedding = None
+    if weights_path:
+        # NOTE(review): torch.load unpickles arbitrary objects; only pass
+        # checkpoints that come from a trusted source.
+        state_dict = torch.load(weights_path, map_location=torch.device('cpu'))
+        null_embedding = state_dict['null_embedding']
+        unet.load_state_dict(state_dict['unet'])
+    unet.to(device=device, dtype=dtype).eval()
+    return unet, null_embedding
+def get_T5encoder(
+        device: Union[str, torch.device],
+        weights_path: str,
+        projection_name: str,
+        dtype: Union[str, torch.dtype] = torch.float32,
+        low_cpu_mem_usage: bool = True,
+        load_in_8bit: bool = False,
+        load_in_4bit: bool = False,
+) -> (T5TextConditionProcessor, T5TextConditionEncoder):
+    """Create the T5 prompt processor and the matching condition encoder.
+
+    Args:
+        device: Device for the encoder and its projection head.
+        weights_path: Directory passed to both the processor and the encoder;
+            the projection checkpoint is looked up inside it.
+        projection_name: File name of the projection checkpoint inside
+            ``weights_path``.
+        dtype: Dtype for the encoder and its projection head.
+        low_cpu_mem_usage: Forwarded to the condition encoder.
+        load_in_8bit: Forwarded to the condition encoder.
+        load_in_4bit: Forwarded to the condition encoder.
+
+    Returns:
+        A ``(processor, condition_encoder)`` pair; the projection head is
+        in eval mode.
+    """
+    max_tokens = 128      # prompt length handed to the processor
+    projected_dim = 4096  # width of the projected text context
+    processor = T5TextConditionProcessor(max_tokens, weights_path)
+    condition_encoder = T5TextConditionEncoder(
+        weights_path,
+        projected_dim,
+        low_cpu_mem_usage=low_cpu_mem_usage,
+        device=device,
+        dtype=dtype,
+        load_in_8bit=load_in_8bit,
+        load_in_4bit=load_in_4bit,
+    )
+    if weights_path:
+        checkpoint_file = os.path.join(weights_path, projection_name)
+        projection_state = torch.load(checkpoint_file, map_location=torch.device('cpu'))
+        condition_encoder.projection.load_state_dict(projection_state)
+    projection_head = condition_encoder.projection
+    projection_head.to(device=device, dtype=dtype).eval()
+    return processor, condition_encoder
+def get_movq(
+        device: Union[str, torch.device],
+        weights_path: Optional[str] = None,
+        dtype: Union[str, torch.dtype] = torch.float32,
+) -> MoVQ:
+    """Build the MoVQ autoencoder and optionally load its weights.
+
+    Args:
+        device: Device the model is moved to.
+        weights_path: Path to a state dict for the model. When falsy the
+            model keeps its fresh initialisation.
+        dtype: Parameter dtype the model is cast to.
+
+    Returns:
+        The MoVQ model in eval mode.
+    """
+    generator_config = dict(
+        double_z=False,
+        z_channels=4,
+        resolution=256,
+        in_channels=3,
+        out_ch=3,
+        ch=256,
+        ch_mult=[1, 2, 2, 4],
+        num_res_blocks=2,
+        attn_resolutions=[32],
+        dropout=0.0,
+    )
+    movq = MoVQ(generator_config)
+    if weights_path:
+        movq.load_state_dict(torch.load(weights_path, map_location=torch.device('cpu')))
+    movq.to(device=device, dtype=dtype).eval()
+    return movq
+def get_inpainting_unet(
+        device: Union[str, torch.device],
+        weights_path: Optional[str] = None,
+        dtype: Union[str, torch.dtype] = torch.float32,
+) -> "tuple[UNet, Optional[torch.Tensor]]":
+    """Build the inpainting UNet (9 input channels) and optionally load it.
+
+    Args:
+        device: Device the UNet is moved to.
+        weights_path: Path to a checkpoint holding the keys ``'unet'`` and
+            ``'null_embedding'``. When falsy the UNet keeps its fresh
+            initialisation.
+        dtype: Parameter dtype the UNet is cast to.
+
+    Returns:
+        A ``(unet, null_embedding)`` pair. ``null_embedding`` is ``None``
+        when no checkpoint was loaded. The UNet is returned in eval mode.
+    """
+    unet = UNet(
+        model_channels=384,
+        num_channels=9,
+        init_channels=192,
+        time_embed_dim=1536,
+        context_dim=4096,
+        groups=32,
+        head_dim=64,
+        expansion_ratio=4,
+        compression_ratio=2,
+        dim_mult=(1, 2, 4, 8),
+        num_blocks=(3, 3, 3, 3),
+        add_cross_attention=(False, True, True, True),
+        add_self_attention=(False, True, True, True),
+    )
+    null_embedding = None
+    if weights_path:
+        # NOTE(review): torch.load unpickles arbitrary objects; only pass
+        # checkpoints that come from a trusted source.
+        state_dict = torch.load(weights_path, map_location=torch.device('cpu'))
+        null_embedding = state_dict['null_embedding']
+        unet.load_state_dict(state_dict['unet'])
+    unet.to(device=device, dtype=dtype).eval()
+    return unet, null_embedding
+_KANDINSKY3_REPO_ID = "ai-forever/Kandinsky3.1"
+_PIPELINE_COMPONENTS = ('unet', 'text_encoder', 'movq')
+def _per_component(value) -> dict:
+    """Return ``value`` as-is if it is a dict, else map every component name to it."""
+    if isinstance(value, dict):
+        return value
+    return {name: value for name in _PIPELINE_COMPONENTS}
+def _resolve_weight_paths(unet_filename, cache_dir, unet_path, text_encoder_path, movq_path):
+    """Download every checkpoint whose path is ``None`` and return the three paths."""
+    if unet_path is None:
+        unet_path = hf_hub_download(
+            repo_id=_KANDINSKY3_REPO_ID, filename=unet_filename, cache_dir=cache_dir
+        )
+    if text_encoder_path is None:
+        snapshot_dir = snapshot_download(
+            repo_id=_KANDINSKY3_REPO_ID, allow_patterns="weights/flan_ul2_encoder/*", cache_dir=cache_dir
+        )
+        text_encoder_path = os.path.join(snapshot_dir, 'weights/flan_ul2_encoder')
+    if movq_path is None:
+        movq_path = hf_hub_download(
+            repo_id=_KANDINSKY3_REPO_ID, filename='weights/movq.pt', cache_dir=cache_dir
+        )
+    return unet_path, text_encoder_path, movq_path
+def _load_pipeline_components(
+        unet_loader, unet_filename, projection_name,
+        device_map, dtype_map, low_cpu_mem_usage, load_in_8bit, load_in_4bit,
+        cache_dir, unet_path, text_encoder_path, movq_path,
+):
+    """Load UNet, text encoder and MoVQ; return the pipeline's leading positional arguments."""
+    device_map = _per_component(device_map)
+    dtype_map = _per_component(dtype_map)
+    unet_path, text_encoder_path, movq_path = _resolve_weight_paths(
+        unet_filename, cache_dir, unet_path, text_encoder_path, movq_path
+    )
+    unet, null_embedding = unet_loader(device_map['unet'], unet_path, dtype=dtype_map['unet'])
+    processor, condition_encoder = get_T5encoder(
+        device_map['text_encoder'], text_encoder_path, projection_name, dtype=dtype_map['text_encoder'],
+        low_cpu_mem_usage=low_cpu_mem_usage, load_in_8bit=load_in_8bit, load_in_4bit=load_in_4bit
+    )
+    movq = get_movq(device_map['movq'], movq_path, dtype=dtype_map['movq'])
+    return device_map, dtype_map, unet, null_embedding, processor, condition_encoder, movq
+def get_T2I_pipeline(
+        device_map: Union[str, torch.device, dict],
+        dtype_map: Union[str, torch.dtype, dict] = torch.float32,
+        low_cpu_mem_usage: bool = True,
+        load_in_8bit: bool = False,
+        load_in_4bit: bool = False,
+        cache_dir: str = '/tmp/kandinsky3/',
+        unet_path: Optional[str] = None,
+        text_encoder_path: Optional[str] = None,
+        movq_path: Optional[str] = None,
+) -> Kandinsky3T2IPipeline:
+    """Assemble the standard text-to-image pipeline.
+
+    Args:
+        device_map: One device for every component, or a dict with the keys
+            ``'unet'``, ``'text_encoder'`` and ``'movq'``.
+        dtype_map: One dtype for every component, or a dict with the same keys.
+        low_cpu_mem_usage: Forwarded to the text encoder loader.
+        load_in_8bit: Forwarded to the text encoder loader.
+        load_in_4bit: Forwarded to the text encoder loader.
+        cache_dir: Download cache directory for the Hugging Face Hub calls.
+        unet_path: Local UNet checkpoint; downloaded when ``None``.
+        text_encoder_path: Local text encoder directory; downloaded when ``None``.
+        movq_path: Local MoVQ checkpoint; downloaded when ``None``.
+
+    Returns:
+        A ``Kandinsky3T2IPipeline`` built with its last positional flag set to ``False``.
+    """
+    components = _load_pipeline_components(
+        get_T2I_unet, 'weights/kandinsky3.pt', 'projection.pt',
+        device_map, dtype_map, low_cpu_mem_usage, load_in_8bit, load_in_4bit,
+        cache_dir, unet_path, text_encoder_path, movq_path,
+    )
+    return Kandinsky3T2IPipeline(*components, False)
+def get_T2I_Flash_pipeline(
+        device_map: Union[str, torch.device, dict],
+        dtype_map: Union[str, torch.dtype, dict] = torch.float32,
+        low_cpu_mem_usage: bool = True,
+        load_in_8bit: bool = False,
+        load_in_4bit: bool = False,
+        cache_dir: str = '/tmp/kandinsky3/',
+        unet_path: Optional[str] = None,
+        text_encoder_path: Optional[str] = None,
+        movq_path: Optional[str] = None,
+) -> Kandinsky3T2IPipeline:
+    """Assemble the "flash" text-to-image pipeline.
+
+    Takes the same arguments as ``get_T2I_pipeline``. It differs only in the
+    default UNet checkpoint (``weights/kandinsky3_flash.pt``), the projection
+    file (``projection_flash.pt``) and the last positional pipeline flag,
+    which is ``True`` here.
+    """
+    components = _load_pipeline_components(
+        get_T2I_unet, 'weights/kandinsky3_flash.pt', 'projection_flash.pt',
+        device_map, dtype_map, low_cpu_mem_usage, load_in_8bit, load_in_4bit,
+        cache_dir, unet_path, text_encoder_path, movq_path,
+    )
+    return Kandinsky3T2IPipeline(*components, True)
+def get_inpainting_pipeline(
+        device_map: Union[str, torch.device, dict],
+        dtype_map: Union[str, torch.dtype, dict] = torch.float32,
+        low_cpu_mem_usage: bool = True,
+        load_in_8bit: bool = False,
+        load_in_4bit: bool = False,
+        cache_dir: str = '/tmp/kandinsky3/',
+        unet_path: Optional[str] = None,
+        text_encoder_path: Optional[str] = None,
+        movq_path: Optional[str] = None,
+) -> Kandinsky3InpaintingPipeline:
+    """Assemble the inpainting pipeline.
+
+    Takes the same arguments as ``get_T2I_pipeline``. It uses the inpainting
+    UNet, whose default checkpoint is ``weights/kandinsky3_inpainting.pt``,
+    and the projection file ``projection_inpainting.pt``.
+    """
+    components = _load_pipeline_components(
+        get_inpainting_unet, 'weights/kandinsky3_inpainting.pt', 'projection_inpainting.pt',
+        device_map, dtype_map, low_cpu_mem_usage, load_in_8bit, load_in_4bit,
+        cache_dir, unet_path, text_encoder_path, movq_path,
+    )
+    return Kandinsky3InpaintingPipeline(*components)

kandinsky3/.ipynb_checkpoints/condition_encoders-checkpoint.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import torch
+from torch import nn
+from transformers import T5EncoderModel
+from typing import Optional, Union
+class T5TextConditionEncoder(nn.Module):
+    """T5 encoder followed by a linear + LayerNorm projection to ``context_dim``."""
+    def __init__(
+            self, model_path, context_dim,
+            low_cpu_mem_usage: bool = True, device: Optional[str] = None,
+            dtype: Union[str, torch.dtype] = torch.float32, load_in_4bit: bool = False, load_in_8bit: bool = False
+    ):
+        """Load the encoder from ``model_path`` and create the projection head."""
+        super().__init__()
+        pretrained = T5EncoderModel.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=low_cpu_mem_usage,
+            device_map=device,
+            torch_dtype=dtype,
+            load_in_8bit=load_in_8bit,
+            load_in_4bit=load_in_4bit,
+        )
+        # Only the encoder stack of the loaded model is kept.
+        self.encoder = pretrained.encoder
+        hidden_width = self.encoder.config.d_model
+        self.projection = nn.Sequential(
+            nn.Linear(hidden_width, context_dim, bias=False),
+            nn.LayerNorm(context_dim)
+        )
+    def forward(self, model_input):
+        """Encode ``model_input`` and return ``(context, context_mask)``.
+
+        With an ``'attention_mask'`` entry, positions whose mask is 0 are
+        zeroed and both outputs are cut to the longest mask sum plus one.
+        Without it, an all-ones mask covering every position is returned.
+        """
+        hidden = self.encoder(**model_input).last_hidden_state
+        context = self.projection(hidden)
+        if 'attention_mask' not in model_input:
+            full_mask = torch.ones(*hidden.shape[:-1], dtype=torch.long, device=hidden.device)
+            return context, full_mask
+        context_mask = model_input['attention_mask']
+        padded = context_mask == 0
+        context[padded] = torch.zeros_like(context[padded])
+        keep = context_mask.sum(-1).max() + 1
+        return context[:, :keep], context_mask[:, :keep]
+def get_condition_encoder(conf):
+    """Build a ``T5TextConditionEncoder`` from a dict of constructor keyword arguments."""
+    encoder = T5TextConditionEncoder(**conf)
+    return encoder

kandinsky3/.ipynb_checkpoints/condition_processors-checkpoint.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import torch
+from transformers import T5Tokenizer
+class T5TextConditionProcessor:
+    """Tokenises prompts into fixed-length ``input_ids`` / ``attention_mask`` tensors."""
+    def __init__(self, tokens_length, processor_path):
+        """Remember the target length and load the tokenizer from ``processor_path``."""
+        self.tokens_length = tokens_length
+        self.processor = T5Tokenizer.from_pretrained(processor_path)
+    def encode(self, text=None, negative_text=None):
+        """Tokenise ``text`` and, if given, ``negative_text``.
+
+        Returns:
+            ``(condition_model_input, negative_condition_model_input)``. Each
+            is a dict with ``'input_ids'`` and ``'attention_mask'`` tensors
+            padded to ``tokens_length``; the second is ``None`` when
+            ``negative_text`` is ``None``.
+        """
+        limit = self.tokens_length
+        tokenized = self.processor(text, max_length=limit, truncation=True)
+        prompt_ids = tokenized['input_ids']
+        n_pad = limit - len(prompt_ids)
+        padded_mask = tokenized['attention_mask'] + [0] * n_pad
+        condition_model_input = {
+            'input_ids': torch.tensor(prompt_ids + [self.processor.pad_token_id] * n_pad, dtype=torch.long),
+            'attention_mask': torch.tensor(padded_mask, dtype=torch.long)
+        }
+        if negative_text is None:
+            return condition_model_input, None
+        negative_tokenized = self.processor(negative_text, max_length=limit, truncation=True)
+        # The negative prompt is cut to the positive prompt's token count and
+        # its last kept token is overwritten with EOS.
+        negative_ids = negative_tokenized['input_ids'][:len(prompt_ids)]
+        negative_ids[-1] = self.processor.eos_token_id
+        negative_ids = negative_ids + [self.processor.pad_token_id] * (limit - len(negative_ids))
+        # NOTE(review): the negative prompt reuses the positive prompt's
+        # attention mask -- presumably so both contexts are cut to the same
+        # length downstream; confirm this is intended.
+        negative_condition_model_input = {
+            'input_ids': torch.tensor(negative_ids, dtype=torch.long),
+            'attention_mask': torch.tensor(padded_mask, dtype=torch.long)
+        }
+        return condition_model_input, negative_condition_model_input

kandinsky3/.ipynb_checkpoints/inpainting_pipeline-checkpoint.py ADDED Viewed

	@@ -0,0 +1,168 @@

+from typing import Union, List
+import PIL
+import numpy as np
+import torch
+import torchvision.transforms as T
+from einops import repeat
+from kandinsky3.model.unet import UNet
+from kandinsky3.movq import MoVQ
+from kandinsky3.condition_encoders import T5TextConditionEncoder
+from kandinsky3.condition_processors import T5TextConditionProcessor
+from kandinsky3.model.diffusion import BaseDiffusion, get_named_beta_schedule
+from kandinsky3.utils import resize_image_for_diffusion, resize_mask_for_diffusion
+class Kandinsky3InpaintingPipeline:
+    """Text-guided inpainting built from a T5 text encoder, a UNet and a MoVQ autoencoder."""
+    def __init__(
+            self,
+            device_map: Union[str, torch.device, dict],
+            dtype_map: Union[str, torch.dtype, dict],
+            unet: UNet,
+            null_embedding: torch.Tensor,
+            t5_processor: T5TextConditionProcessor,
+            t5_encoder: T5TextConditionEncoder,
+            movq: MoVQ,
+    ):
+        """Store the already-loaded components.
+
+        ``device_map`` and ``dtype_map`` are indexed with the keys ``'unet'``,
+        ``'text_encoder'`` and ``'movq'`` by the other methods.
+        """
+        self.device_map = device_map
+        self.dtype_map = dtype_map
+        self.to_pil = T.ToPILImage()
+        self.to_tensor = T.ToTensor()
+        self.unet = unet
+        self.null_embedding = null_embedding
+        self.t5_processor = t5_processor
+        self.t5_encoder = t5_encoder
+        self.movq = movq
+    def shared_step(self, batch: dict) -> dict:
+        """Encode the masked image and the prompts of a prepared batch.
+
+        Args:
+            batch: Dict with ``'image'``, ``'mask'``, ``'text'`` and
+                ``'negative_text'`` entries, optionally ``'masked_image'``.
+
+        Returns:
+            Dict with the text contexts and masks, the original image, the
+            MoVQ latent of the masked image and the mask resized to that
+            latent's spatial size.
+
+        Raises:
+            ValueError: If the batch has no ``'masked_image'`` and the UNet
+                does not take 9 input channels.
+        """
+        image = batch['image']
+        condition_model_input = batch['text']
+        negative_condition_model_input = batch['negative_text']
+        mask = batch['mask']
+        if 'masked_image' in batch:
+            masked_latent = batch['masked_image']
+        elif self.unet.in_layer.in_channels == 9:
+            # Zero every pixel where the mask is not exactly 1.
+            masked_latent = image.masked_fill((1 - mask).bool(), 0)
+        else:
+            raise ValueError(
+                "batch has no 'masked_image' and the UNet does not take 9 input channels"
+            )
+        with torch.cuda.amp.autocast(dtype=self.dtype_map['movq']):
+            masked_latent = self.movq.encode(masked_latent)
+        mask = torch.nn.functional.interpolate(mask, size=(masked_latent.shape[2], masked_latent.shape[3]))
+        # Both prompts are encoded under the same autocast context so they
+        # are produced with the same precision settings.
+        with torch.cuda.amp.autocast(dtype=self.dtype_map['text_encoder']):
+            context, context_mask = self.t5_encoder(condition_model_input)
+            if negative_condition_model_input is not None:
+                negative_context, negative_context_mask = self.t5_encoder(negative_condition_model_input)
+            else:
+                negative_context, negative_context_mask = None, None
+        return {
+            'context': context,
+            'context_mask': context_mask,
+            'negative_context': negative_context,
+            'negative_context_mask': negative_context_mask,
+            'image': image,
+            'masked_latent': masked_latent,
+            'mask': mask
+        }
+    def prepare_batch(
+            self,
+            text: str,
+            negative_text: str,
+            image: PIL.Image.Image,
+            mask: np.ndarray,
+    ) -> dict:
+        """Turn raw inputs into a single-sample batch on the right devices.
+
+        The image is converted to RGB, resized and scaled to ``[-1, 1]``.
+        The mask is resized and inverted (``1 - mask``). Every tensor gets a
+        leading batch dimension of size 1.
+        """
+        condition_model_input, negative_condition_model_input = self.t5_processor.encode(
+            text=text, negative_text=negative_text
+        )
+        batch = {
+            'image': self.to_tensor(resize_image_for_diffusion(image.convert("RGB"))) * 2 - 1,
+            'mask': 1 - self.to_tensor(resize_mask_for_diffusion(mask)),
+            'text': condition_model_input,
+            'negative_text': negative_condition_model_input
+        }
+        batch['mask'] = batch['mask'].type(self.dtype_map['movq'])
+        batch['image'] = batch['image'].unsqueeze(0).to(self.device_map['movq'])
+        batch['text']['input_ids'] = batch['text']['input_ids'].unsqueeze(0).to(self.device_map['text_encoder'])
+        batch['text']['attention_mask'] = batch['text']['attention_mask'].unsqueeze(0).to(
+            self.device_map['text_encoder'])
+        batch['mask'] = batch['mask'].unsqueeze(0).to(self.device_map['movq'])
+        if negative_condition_model_input is not None:
+            # The negative prompt needs the same leading batch dimension as
+            # the positive one: __call__ repeats its context with a
+            # '1 n d -> b n d' pattern.
+            for key in ('input_ids', 'attention_mask'):
+                batch['negative_text'][key] = batch['negative_text'][key].unsqueeze(0).to(
+                    self.device_map['text_encoder'])
+        return batch
+    def __call__(
+            self,
+            text: str,
+            image: PIL.Image.Image,
+            mask: np.ndarray,
+            negative_text: str = None,
+            images_num: int = 1,
+            bs: int = 1,
+            steps: int = 50,
+            guidance_weight_text: float = 4,
+            eta=1.0
+    ) -> List[PIL.Image.Image]:
+        """Generate ``images_num`` inpainted images for ``text``.
+
+        Args:
+            text: Prompt describing the content to paint.
+            image: Source image.
+            mask: Inpainting mask as an array.
+            negative_text: Optional negative prompt.
+            images_num: Total number of images to return.
+            bs: Number of images sampled per minibatch.
+            steps: Controls the stride of the sampled timestep list.
+            guidance_weight_text: Forwarded to the sampler.
+            eta: Forwarded to the sampler.
+
+        Returns:
+            The generated images as PIL images.
+        """
+        with torch.no_grad():
+            batch = self.prepare_batch(text, negative_text, image, mask)
+            processed = self.shared_step(batch)
+            betas = get_named_beta_schedule('cosine', 1000)
+            base_diffusion = BaseDiffusion(betas, percentile=0.95)
+            times = list(range(999, 0, -1000 // steps))
+            pil_images = []
+            # k full minibatches of size bs, then one remainder of size m.
+            k, m = images_num // bs, images_num % bs
+            for minibatch in [bs] * k + [m]:
+                if minibatch == 0:
+                    continue
+                bs_context = repeat(processed['context'], '1 n d -> b n d', b=minibatch)
+                bs_context_mask = repeat(processed['context_mask'], '1 n -> b n', b=minibatch)
+                if processed['negative_context'] is not None:
+                    bs_negative_context = repeat(processed['negative_context'], '1 n d -> b n d', b=minibatch)
+                    bs_negative_context_mask = repeat(processed['negative_context_mask'], '1 n -> b n', b=minibatch)
+                else:
+                    bs_negative_context, bs_negative_context_mask = None, None
+                mask = processed['mask'].repeat_interleave(minibatch, dim=0)
+                masked_latent = processed['masked_latent'].repeat_interleave(minibatch, dim=0)
+                minibatch = masked_latent.shape[0]
+                with torch.cuda.amp.autocast(dtype=self.dtype_map['unet']):
+                    with torch.no_grad():
+                        images = base_diffusion.p_sample_loop(
+                            self.unet, (minibatch, 4, masked_latent.shape[2], masked_latent.shape[3]), times,
+                            self.device_map['unet'],
+                            bs_context, bs_context_mask, self.null_embedding, guidance_weight_text, eta,
+                            negative_context=bs_negative_context, negative_context_mask=bs_negative_context_mask,
+                            mask=mask, masked_latent=masked_latent, gan=False
+                        )
+                with torch.cuda.amp.autocast(dtype=self.dtype_map['movq']):
+                    # Decoded in up to two chunks -- presumably to bound
+                    # peak memory; TODO confirm.
+                    images = torch.cat([self.movq.decode(image) for image in images.chunk(2)])
+                    # Map from [-1, 1] back to [0, 1] before converting to PIL.
+                    images = torch.clip((images + 1.) / 2., 0., 1.).cpu()
+                for images_chunk in images.chunk(1):
+                    pil_images += [self.to_pil(image) for image in images_chunk]
+        return pil_images

kandinsky3/.ipynb_checkpoints/movq-checkpoint.py ADDED Viewed

	@@ -0,0 +1,431 @@

+import math
+import torch
+import torch.nn as nn
+import numpy as np
+import torch.nn.functional as F
+from .utils import freeze
+def nonlinearity(x):
+    """Return ``x * sigmoid(x)`` element-wise."""
+    gate = torch.sigmoid(x)
+    return x * gate
+class SpatialNorm(nn.Module):
+    """Normalisation layer optionally modulated by a second feature map ``zq``.
+
+    The input is normalised by ``norm_layer``. When ``zq`` is passed to
+    ``forward`` it is resized to the input's spatial size and used to
+    compute a per-position scale (``conv_y``) and shift (``conv_b``).
+    """
+    def __init__(
+        self, f_channels, zq_channels=None, norm_layer=nn.GroupNorm, freeze_norm_layer=False, add_conv=False, **norm_layer_params
+    ):
+        """Create the norm layer and, if ``zq_channels`` is given, the modulation convs.
+
+        Args:
+            f_channels: Channel count of the normalised input.
+            zq_channels: Channel count of ``zq``; ``None`` disables modulation.
+            norm_layer: Factory called with ``num_channels`` and ``norm_layer_params``.
+            freeze_norm_layer: If true (and ``zq_channels`` is given), the
+                norm layer's parameters stop requiring gradients.
+            add_conv: If true, ``zq`` goes through an extra 3x3 conv first.
+            **norm_layer_params: Extra keyword arguments for ``norm_layer``.
+        """
+        super().__init__()
+        self.norm_layer = norm_layer(num_channels=f_channels, **norm_layer_params)
+        if zq_channels is not None:
+            if freeze_norm_layer:
+                # parameters is a method; it must be called to get the iterator.
+                for p in self.norm_layer.parameters():
+                    p.requires_grad = False
+            self.add_conv = add_conv
+            if self.add_conv:
+                self.conv = nn.Conv2d(zq_channels, zq_channels, kernel_size=3, stride=1, padding=1)
+            self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0)
+            self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0)
+    def forward(self, f, zq=None):
+        """Normalise ``f``; if ``zq`` is given, apply the scale and shift derived from it."""
+        norm_f = self.norm_layer(f)
+        if zq is not None:
+            f_size = f.shape[-2:]
+            # Bring zq to the spatial size of f before the 1x1 convs.
+            zq = torch.nn.functional.interpolate(zq, size=f_size, mode="nearest")
+            if self.add_conv:
+                zq = self.conv(zq)
+            norm_f = norm_f * self.conv_y(zq) + self.conv_b(zq)
+        return norm_f
+def Normalize(in_channels, zq_ch=None, add_conv=None):
+    """Return a ``SpatialNorm`` wrapping a 32-group affine ``GroupNorm`` with ``eps=1e-6``."""
+    group_norm_kwargs = dict(num_groups=32, eps=1e-6, affine=True)
+    return SpatialNorm(
+        in_channels,
+        zq_ch,
+        norm_layer=nn.GroupNorm,
+        freeze_norm_layer=False,
+        add_conv=add_conv,
+        **group_norm_kwargs,
+    )
+class Upsample(nn.Module):
+    """Nearest-neighbour 2x upsampling, optionally followed by a 3x3 conv."""
+    def __init__(self, in_channels, with_conv):
+        """Create the optional conv; it keeps the channel count unchanged."""
+        super().__init__()
+        self.with_conv = with_conv
+        if with_conv:
+            self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+    def forward(self, x):
+        """Double the spatial size of ``x`` and apply the conv when enabled."""
+        upsampled = F.interpolate(x, scale_factor=2.0, mode="nearest")
+        if not self.with_conv:
+            return upsampled
+        return self.conv(upsampled)
+class Downsample(nn.Module):
+    """2x downsampling by a strided 3x3 conv or, without conv, by average pooling."""
+    def __init__(self, in_channels, with_conv):
+        """Create the optional stride-2 conv; it keeps the channel count unchanged."""
+        super().__init__()
+        self.with_conv = with_conv
+        if with_conv:
+            self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
+    def forward(self, x):
+        """Halve the spatial size of ``x``."""
+        if not self.with_conv:
+            return F.avg_pool2d(x, kernel_size=2, stride=2)
+        # The conv itself has no padding, so pad one zero column on the
+        # right and one zero row at the bottom by hand.
+        padded = F.pad(x, (0, 1, 0, 1), mode="constant", value=0)
+        return self.conv(padded)
+class ResnetBlock(nn.Module):
+    """Residual block: two norm -> activation -> 3x3 conv stages plus a skip connection."""
+    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
+                 dropout, temb_channels=512, zq_ch=None, add_conv=False):
+        """Create the block's layers.
+
+        Args:
+            in_channels: Channel count of the input.
+            out_channels: Channel count of the output; defaults to ``in_channels``.
+            conv_shortcut: When the channel counts differ, use a 3x3 conv on
+                the skip path instead of a 1x1 conv.
+            dropout: Dropout probability applied before the second conv.
+            temb_channels: Width of the ``temb`` input; values ``<= 0``
+                skip creating its projection.
+            zq_ch: Forwarded to both norm layers.
+            add_conv: Forwarded to both norm layers.
+        """
+        super().__init__()
+        if out_channels is None:
+            out_channels = in_channels
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.norm1 = Normalize(in_channels, zq_ch, add_conv=add_conv)
+        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if temb_channels > 0:
+            self.temb_proj = nn.Linear(temb_channels, out_channels)
+        self.norm2 = Normalize(out_channels, zq_ch, add_conv=add_conv)
+        self.dropout = nn.Dropout(dropout)
+        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if in_channels == out_channels:
+            # Identity skip: no projection layer is needed.
+            return
+        if conv_shortcut:
+            self.conv_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        else:
+            self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+    def forward(self, x, temb, zq=None):
+        """Return ``x`` (projected if the channel counts differ) plus the residual branch."""
+        residual = self.conv1(nonlinearity(self.norm1(x, zq)))
+        if temb is not None:
+            # Broadcast the projected embedding over the two spatial axes.
+            residual = residual + self.temb_proj(nonlinearity(temb))[:, :, None, None]
+        residual = self.conv2(self.dropout(nonlinearity(self.norm2(residual, zq))))
+        if self.in_channels != self.out_channels:
+            project = self.conv_shortcut if self.use_conv_shortcut else self.nin_shortcut
+            x = project(x)
+        return x + residual
+class AttnBlock(nn.Module):
+    """Single-head self-attention over all spatial positions, with a residual connection."""
+    def __init__(self, in_channels, zq_ch=None, add_conv=False):
+        """Create the norm layer and the four 1x1 convs (q, k, v, output projection)."""
+        super().__init__()
+        self.in_channels = in_channels
+        self.norm = Normalize(in_channels, zq_ch, add_conv=add_conv)
+        def pointwise():
+            return nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.q = pointwise()
+        self.k = pointwise()
+        self.v = pointwise()
+        self.proj_out = pointwise()
+    def forward(self, x, zq=None):
+        """Attend every spatial position of ``x`` to every other and add the result to ``x``."""
+        normed = self.norm(x, zq)
+        query = self.q(normed)
+        key = self.k(normed)
+        value = self.v(normed)
+        batch, channels, height, width = query.shape
+        tokens = height * width
+        query = query.reshape(batch, channels, tokens).permute(0, 2, 1)  # b, hw, c
+        key = key.reshape(batch, channels, tokens)                       # b, c, hw
+        # scores[b, i, j] = sum_c query[b, i, c] * key[b, c, j], scaled by c ** -0.5
+        scores = torch.bmm(query, key)
+        scores = scores * (int(channels) ** (-0.5))
+        weights = torch.nn.functional.softmax(scores, dim=2)
+        value = value.reshape(batch, channels, tokens)
+        # Transpose so the key index comes first, then mix the values.
+        weights = weights.permute(0, 2, 1)
+        attended = torch.bmm(value, weights)                             # b, c, hw
+        attended = attended.reshape(batch, channels, height, width)
+        return x + self.proj_out(attended)
+class Encoder(nn.Module):
+    """Convolutional encoder made of residual blocks, optional attention and downsampling.
+
+    Layout: ``conv_in`` -> ``len(ch_mult)`` levels of ``num_res_blocks``
+    ResnetBlocks (each followed by an AttnBlock when the level's resolution is
+    listed in ``attn_resolutions``), with a Downsample after every level but
+    the last -> ResnetBlock / AttnBlock / ResnetBlock middle stage ->
+    Normalize, nonlinearity and ``conv_out``.
+    """
+    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+                 resolution, z_channels, double_z=True, **ignore_kwargs):
+        """Build the encoder layers.
+
+        ``ch`` is the base channel count and ``ch_mult`` its per-level
+        multiplier. ``out_ch`` is accepted but not used by this class, and
+        extra keyword arguments are ignored. ``conv_out`` produces
+        ``2*z_channels`` channels when ``double_z`` is true, else ``z_channels``.
+        """
+        super().__init__()
+        self.ch = ch
+        # No timestep embedding is used: blocks are built with 0 embedding channels.
+        self.temb_ch = 0
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        # downsampling
+        self.conv_in = torch.nn.Conv2d(in_channels,
+                                       self.ch,
+                                       kernel_size=3,
+                                       stride=1,
+                                       padding=1)
+        curr_res = resolution
+        # Prepending 1 makes level i take ch*in_ch_mult[i] input channels,
+        # i.e. the output width of the previous level (or of conv_in).
+        in_ch_mult = (1,)+tuple(ch_mult)
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = ch*in_ch_mult[i_level]
+            block_out = ch*ch_mult[i_level]
+            for i_block in range(self.num_res_blocks):
+                block.append(ResnetBlock(in_channels=block_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout))
+                block_in = block_out
+                # One attention block per residual block at attention resolutions.
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock(block_in))
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            # Every level except the last halves the tracked resolution.
+            if i_level != self.num_resolutions-1:
+                down.downsample = Downsample(block_in, resamp_with_conv)
+                curr_res = curr_res // 2
+            self.down.append(down)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv2d(block_in,
+                                        2*z_channels if double_z else z_channels,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+    def forward(self, x):
+        """Run ``x`` through the down path, the middle stage and the output conv."""
+        # The residual blocks are always called with no timestep embedding.
+        temb = None
+        # downsampling
+        # hs collects every intermediate activation; only hs[-1] is read back.
+        hs = [self.conv_in(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1], temb)
+                # attn is either empty or holds one block per residual block.
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level != self.num_resolutions-1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h, temb)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h, temb)
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+class Decoder(nn.Module):
+    """Convolutional decoder whose blocks additionally receive the tensor ``zq``.
+
+    Layout: ``conv_in`` -> ResnetBlock / AttnBlock / ResnetBlock middle stage
+    -> ``len(ch_mult)`` levels of ``num_res_blocks + 1`` ResnetBlocks (with
+    AttnBlocks at resolutions listed in ``attn_resolutions``) and an Upsample
+    after every level but level 0 -> Normalize, nonlinearity and ``conv_out``.
+    """
+    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+                 resolution, z_channels, give_pre_end=False, zq_ch=None, add_conv=False, **ignorekwargs):
+        """Build the decoder layers.
+
+        ``zq_ch`` and ``add_conv`` are passed through to every ResnetBlock,
+        AttnBlock and the final Normalize. With ``give_pre_end`` set, ``forward``
+        returns the features before the output norm/conv. Extra keyword
+        arguments are ignored.
+        """
+        super().__init__()
+        self.ch = ch
+        # No timestep embedding is used: blocks are built with 0 embedding channels.
+        self.temb_ch = 0
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.give_pre_end = give_pre_end
+        # compute in_ch_mult, block_in and curr_res at lowest res
+        # NOTE: in_ch_mult is computed here but not read again in this class.
+        in_ch_mult = (1,)+tuple(ch_mult)
+        block_in = ch*ch_mult[self.num_resolutions-1]
+        curr_res = resolution // 2**(self.num_resolutions-1)
+        # Stored for reference; only the commented-out assert in forward mentions it.
+        self.z_shape = (1,z_channels,curr_res,curr_res)
+        # z to block_in
+        self.conv_in = torch.nn.Conv2d(z_channels,
+                                       block_in,
+                                       kernel_size=3,
+                                       stride=1,
+                                       padding=1)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout,
+                                       zq_ch=zq_ch,
+                                       add_conv=add_conv)
+        self.mid.attn_1 = AttnBlock(block_in, zq_ch, add_conv=add_conv)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout,
+                                       zq_ch=zq_ch,
+                                       add_conv=add_conv)
+        # upsampling
+        self.up = nn.ModuleList()
+        # Levels are built from the lowest resolution (last index) up to level 0.
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = ch*ch_mult[i_level]
+            for i_block in range(self.num_res_blocks+1):
+                block.append(ResnetBlock(in_channels=block_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout,
+                                         zq_ch=zq_ch,
+                                         add_conv=add_conv))
+                block_in = block_out
+                # One attention block per residual block at attention resolutions.
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock(block_in, zq_ch, add_conv=add_conv))
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            # Every level except level 0 doubles the tracked resolution.
+            if i_level != 0:
+                up.upsample = Upsample(block_in, resamp_with_conv)
+                curr_res = curr_res * 2
+            self.up.insert(0, up) # prepend to get consistent order
+        # end
+        self.norm_out = Normalize(block_in, zq_ch, add_conv=add_conv)
+        self.conv_out = torch.nn.Conv2d(block_in,
+                                        out_ch,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+    def forward(self, z, zq):
+        """Decode ``z``, handing ``zq`` to every block and to the final norm.
+
+        Returns the pre-output features when ``give_pre_end`` is set, otherwise
+        the result of ``conv_out``. Records ``z.shape`` in ``self.last_z_shape``.
+        """
+        #assert z.shape[1:] == self.z_shape[1:]
+        self.last_z_shape = z.shape
+        # timestep embedding
+        temb = None
+        # z to block_in
+        h = self.conv_in(z)
+        # middle
+        h = self.mid.block_1(h, temb, zq)
+        h = self.mid.attn_1(h, zq)
+        h = self.mid.block_2(h, temb, zq)
+        # upsampling
+        # self.up is stored with level 0 first, so walk it in reverse.
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks+1):
+                h = self.up[i_level].block[i_block](h, temb, zq)
+                # attn is either empty or holds one block per residual block.
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h, zq)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+        # end
+        if self.give_pre_end:
+            return h
+        h = self.norm_out(h, zq)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+class MoVQ(nn.Module):
+    def __init__(self, generator_params):
+        super().__init__()
+        z_channels = generator_params["z_channels"]
+        self.encoder = Encoder(**generator_params)
+        self.quant_conv = torch.nn.Conv2d(z_channels, z_channels, 1)
+        self.post_quant_conv = torch.nn.Conv2d(z_channels, z_channels, 1)
+        self.decoder = Decoder(zq_ch=z_channels, **generator_params)
+    # @torch.no_grad()
+    def encode(self, x):
+        h = self.encoder(x)
+        h = self.quant_conv(h)
+        return h
+    # @torch.no_grad()
+    def decode(self, quant):
+        decoder_input = self.post_quant_conv(quant)
+        decoded = self.decoder(decoder_input, quant)
+        return decoded
+def get_vae(conf):
+    movq = MoVQ(conf.params)
+    if conf.checkpoint is not None:
+        movq_state_dict = torch.load(conf.checkpoint)
+        movq.load_state_dict(movq_state_dict)
+    movq = freeze(movq)
+    return movq

kandinsky3/.ipynb_checkpoints/t2i_pipeline-checkpoint.py ADDED Viewed

	@@ -0,0 +1,109 @@

+from typing import Union, List
+import PIL
+import torch
+import torchvision.transforms as T
+from einops import repeat
+from kandinsky3.model.unet import UNet
+from kandinsky3.movq import MoVQ
+from kandinsky3.condition_encoders import T5TextConditionEncoder
+from kandinsky3.condition_processors import T5TextConditionProcessor
+from kandinsky3.model.diffusion import BaseDiffusion, get_named_beta_schedule
+class Kandinsky3T2IPipeline:
+    def __init__(
+            self,
+            device_map: Union[str, torch.device, dict],
+            dtype_map: Union[str, torch.dtype, dict],
+            unet: UNet,
+            null_embedding: torch.Tensor,
+            t5_processor: T5TextConditionProcessor,
+            t5_encoder: T5TextConditionEncoder,
+            movq: MoVQ,
+            gan: bool,
+    ):
+        self.device_map = device_map
+        self.dtype_map = dtype_map
+        self.to_pil = T.ToPILImage()
+        self.unet = unet
+        self.null_embedding = null_embedding
+        self.t5_processor = t5_processor
+        self.t5_encoder = t5_encoder
+        self.movq = movq
+        self.gan = gan
+    def __call__(
+            self,
+            text: str,
+            negative_text: str = None,
+            images_num: int = 1,
+            bs: int = 1,
+            width: int = 1024,
+            height: int = 1024,
+            guidance_scale: float = 3.0,
+            steps: int = 50,
+            eta: float = 1.0
+    ) -> List[PIL.Image.Image]:
+        betas = get_named_beta_schedule('cosine', 1000)
+        base_diffusion = BaseDiffusion(betas, 0.99)
+        times = list(range(999, 0, -1000 // steps))
+        if self.gan:
+            times = list(range(979, 0, -250))
+        condition_model_input, negative_condition_model_input = self.t5_processor.encode(text, negative_text)
+        for input_type in condition_model_input:
+            condition_model_input[input_type] = condition_model_input[input_type][None].to(
+                self.device_map['text_encoder']
+            )
+        if negative_condition_model_input is not None:
+            for input_type in negative_condition_model_input:
+                negative_condition_model_input[input_type] = negative_condition_model_input[input_type][None].to(
+                    self.device_map['text_encoder']
+                )
+        pil_images = []
+        with torch.no_grad():
+            with torch.cuda.amp.autocast(dtype=self.dtype_map['text_encoder']):
+                context, context_mask = self.t5_encoder(condition_model_input)
+                if negative_condition_model_input is not None:
+                    negative_context, negative_context_mask = self.t5_encoder(negative_condition_model_input)
+                else:
+                    negative_context, negative_context_mask = None, None
+            k, m = images_num // bs, images_num % bs
+            for minibatch in [bs] * k + [m]:
+                if minibatch == 0:
+                    continue
+                bs_context = repeat(context, '1 n d -> b n d', b=minibatch)
+                bs_context_mask = repeat(context_mask, '1 n -> b n', b=minibatch)
+                if negative_context is not None:
+                    bs_negative_context = repeat(negative_context, '1 n d -> b n d', b=minibatch)
+                    bs_negative_context_mask = repeat(negative_context_mask, '1 n -> b n', b=minibatch)
+                else:
+                    bs_negative_context, bs_negative_context_mask = None, None
+                with torch.cuda.amp.autocast(dtype=self.dtype_map['unet']):
+                    images = base_diffusion.p_sample_loop(
+                        self.unet, (minibatch, 4, height // 8, width // 8), times, self.device_map['unet'],
+                        bs_context, bs_context_mask, self.null_embedding, guidance_scale, eta,
+                        negative_context=bs_negative_context, negative_context_mask=bs_negative_context_mask,
+                        gan=self.gan
+                    )
+                with torch.cuda.amp.autocast(dtype=self.dtype_map['movq']):
+                    images = torch.cat([self.movq.decode(image) for image in images.chunk(2)])
+                    # print(torch.max(images), torch.min(images))
+                    images = torch.clip((images + 1.) / 2., 0., 1.)
+                    # print(torch.max(images), torch.min(images))
+                    # raise
+                    for images_chunk in images.chunk(1):
+                        pil_images += [self.to_pil(image) for image in images_chunk]
+        return pil_images

kandinsky3/.ipynb_checkpoints/utils-checkpoint.py ADDED Viewed

	@@ -0,0 +1,71 @@

+from omegaconf import OmegaConf
+import numpy as np
+from scipy import ndimage
+import torch.nn as nn
+from skimage.transform import resize
+def load_conf(config_path):
+    conf = OmegaConf.load(config_path)
+    conf.data.tokens_length = conf.common.tokens_length
+    conf.data.processor_names = conf.model.encoders.model_names
+    conf.data.dataset.seed = conf.common.seed
+    conf.data.dataset.image_size = conf.common.image_size
+    conf.trainer.trainer_params.max_steps = conf.common.train_steps
+    conf.scheduler.params.total_steps = conf.common.train_steps
+    conf.logger.tensorboard.name = conf.common.experiment_name
+    conf.model.encoders.context_dim = conf.model.unet_params.context_dim
+    return conf
+def freeze(model):
+    for p in model.parameters():
+        p.requires_grad = False
+    return model
+def unfreeze(model):
+    for p in model.parameters():
+        p.requires_grad = True
+    return model
+def zero_module(module):
+    for p in module.parameters():
+        nn.init.zeros_(p)
+    return module
+def resize_mask_for_diffusion(mask):
+    reduce_factor = max(1, (mask.size / 1024**2)**0.5)
+    resized_mask = resize(
+        mask,
+        (
+            (round(mask.shape[0] / reduce_factor) // 64) * 64,
+            (round(mask.shape[1] / reduce_factor) // 64) * 64
+        ),
+        preserve_range=True,
+        anti_aliasing=False
+    )
+    return resized_mask
+def resize_image_for_diffusion(image):
+    reduce_factor = max(1, (image.size[0] * image.size[1] / 1024**2)**0.5)
+    image = image.resize((
+        (round(image.size[0] / reduce_factor) // 64) * 64, (round(image.size[1] / reduce_factor) // 64) * 64
+    ))
+    return image
+def prepare_mask(mask):
+    ker = np.array([[1, 1,  1, 1, 1],
+        [1, 5,  5, 5, 1],
+        [1, 5, 44, 5, 1],
+        [1, 5,  5, 5, 1],
+        [1, 1,  1, 1, 1]]) / 100
+    out = ndimage.convolve(mask, ker)
+    out = ndimage.convolve(out, ker)
+    out = ndimage.convolve(out, ker)
+    mask = (out > 0).astype(int)
+    return mask

kandinsky3/__init__.py ADDED Viewed

	@@ -0,0 +1,267 @@

+import os
+from typing import Optional, Union
+import torch
+from huggingface_hub import hf_hub_download, snapshot_download
+from kandinsky3.model.unet import UNet
+from kandinsky3.movq import MoVQ
+from kandinsky3.condition_encoders import T5TextConditionEncoder
+from kandinsky3.condition_processors import T5TextConditionProcessor
+from kandinsky3.model.diffusion import BaseDiffusion, get_named_beta_schedule
+from .t2i_pipeline import Kandinsky3T2IPipeline
+from .inpainting_pipeline import Kandinsky3InpaintingPipeline
+def get_T2I_unet(
+        device: Union[str, torch.device],
+        weights_path: Optional[str] = None,
+        dtype: Union[str, torch.dtype] = torch.float32,
+) -> (UNet, Optional[torch.Tensor], Optional[dict]):
+    unet = UNet(
+        model_channels=384,
+        num_channels=4,
+        init_channels=192,
+        time_embed_dim=1536,
+        context_dim=4096,
+        groups=32,
+        head_dim=64,
+        expansion_ratio=4,
+        compression_ratio=2,
+        dim_mult=(1, 2, 4, 8),
+        num_blocks=(3, 3, 3, 3),
+        add_cross_attention=(False, True, True, True),
+        add_self_attention=(False, True, True, True),
+    )
+    null_embedding = None
+    if weights_path:
+        state_dict = torch.load(weights_path, map_location=torch.device('cpu'))
+        null_embedding = state_dict['null_embedding']
+        unet.load_state_dict(state_dict['unet'])
+    unet.to(device=device, dtype=dtype).eval()
+    return unet, null_embedding
+def get_T5encoder(
+        device: Union[str, torch.device],
+        weights_path: str,
+        projection_name: str,
+        dtype: Union[str, torch.dtype] = torch.float32,
+        low_cpu_mem_usage: bool = True,
+        load_in_8bit: bool = False,
+        load_in_4bit: bool = False,
+) -> (T5TextConditionProcessor, T5TextConditionEncoder):
+    tokens_length = 128
+    context_dim = 4096
+    processor = T5TextConditionProcessor(tokens_length, weights_path)
+    condition_encoder = T5TextConditionEncoder(
+        weights_path, context_dim, low_cpu_mem_usage=low_cpu_mem_usage, device=device,
+        dtype=dtype, load_in_8bit=load_in_8bit, load_in_4bit=load_in_4bit
+    )
+    if weights_path:
+        projections_weights_path = os.path.join(weights_path, projection_name)
+        state_dict = torch.load(projections_weights_path, map_location=torch.device('cpu'))
+        condition_encoder.projection.load_state_dict(state_dict)
+    condition_encoder.projection.to(device=device, dtype=dtype).eval()
+    return processor, condition_encoder
+def get_movq(
+        device: Union[str, torch.device],
+        weights_path: Optional[str] = None,
+        dtype: Union[str, torch.dtype] = torch.float32,
+) -> MoVQ:
+    generator_config = {
+        'double_z': False,
+        'z_channels': 4,
+        'resolution': 256,
+        'in_channels': 3,
+        'out_ch': 3,
+        'ch': 256,
+        'ch_mult': [1, 2, 2, 4],
+        'num_res_blocks': 2,
+        'attn_resolutions': [32],
+        'dropout': 0.0
+    }
+    movq = MoVQ(generator_config)
+    if weights_path:
+        state_dict = torch.load(weights_path, map_location=torch.device('cpu'))
+        movq.load_state_dict(state_dict)
+    movq.to(device=device, dtype=dtype).eval()
+    return movq
+def get_inpainting_unet(
+        device: Union[str, torch.device],
+        weights_path: Optional[str] = None,
+        dtype: Union[str, torch.dtype] = torch.float32,
+) -> (UNet, Optional[torch.Tensor], Optional[dict]):
+    unet = UNet(
+        model_channels=384,
+        num_channels=9,
+        init_channels=192,
+        time_embed_dim=1536,
+        context_dim=4096,
+        groups=32,
+        head_dim=64,
+        expansion_ratio=4,
+        compression_ratio=2,
+        dim_mult=(1, 2, 4, 8),
+        num_blocks=(3, 3, 3, 3),
+        add_cross_attention=(False, True, True, True),
+        add_self_attention=(False, True, True, True),
+    )
+    null_embedding = None
+    if weights_path:
+        state_dict = torch.load(weights_path, map_location=torch.device('cpu'))
+        null_embedding = state_dict['null_embedding']
+        unet.load_state_dict(state_dict['unet'])
+    unet.to(device=device, dtype=dtype).eval()
+    return unet, null_embedding
+def get_T2I_pipeline(
+        device_map: Union[str, torch.device, dict],
+        dtype_map: Union[str, torch.dtype, dict] = torch.float32,
+        low_cpu_mem_usage: bool = True,
+        load_in_8bit: bool = False,
+        load_in_4bit: bool = False,
+        cache_dir: str = '/tmp/kandinsky3/',
+        unet_path: str = None,
+        text_encoder_path: str = None,
+        movq_path: str = None,
+) -> Kandinsky3T2IPipeline:
+    # assert ((unet_path is not None) or (text_encoder_path is not None) or (movq_path is not None))
+    if not isinstance(device_map, dict):
+        device_map = {
+            'unet': device_map, 'text_encoder': device_map, 'movq': device_map
+        }
+    if not isinstance(dtype_map, dict):
+        dtype_map = {
+            'unet': dtype_map, 'text_encoder': dtype_map, 'movq': dtype_map
+        }
+    if unet_path is None:
+        unet_path = hf_hub_download(
+            repo_id="ai-forever/Kandinsky3.1", filename='weights/kandinsky3.pt', cache_dir=cache_dir
+        )
+    if text_encoder_path is None:
+        text_encoder_path = snapshot_download(
+            repo_id="ai-forever/Kandinsky3.1", allow_patterns="weights/flan_ul2_encoder/*", cache_dir=cache_dir
+        )
+        text_encoder_path = os.path.join(text_encoder_path, 'weights/flan_ul2_encoder')
+    if movq_path is None:
+        movq_path = hf_hub_download(
+            repo_id="ai-forever/Kandinsky3.1", filename='weights/movq.pt', cache_dir=cache_dir
+        )
+    unet, null_embedding = get_T2I_unet(device_map['unet'], unet_path, dtype=dtype_map['unet'])
+    processor, condition_encoder = get_T5encoder(
+        device_map['text_encoder'], text_encoder_path, 'projection.pt', dtype=dtype_map['text_encoder'],
+        low_cpu_mem_usage=low_cpu_mem_usage, load_in_8bit=load_in_8bit, load_in_4bit=load_in_4bit
+    )
+    movq = get_movq(device_map['movq'], movq_path, dtype=dtype_map['movq'])
+    return Kandinsky3T2IPipeline(
+        device_map, dtype_map, unet, null_embedding, processor, condition_encoder, movq, False
+    )
+def get_T2I_Flash_pipeline(
+        device_map: Union[str, torch.device, dict],
+        dtype_map: Union[str, torch.dtype, dict] = torch.float32,
+        low_cpu_mem_usage: bool = True,
+        load_in_8bit: bool = False,
+        load_in_4bit: bool = False,
+        cache_dir: str = '/tmp/kandinsky3/',
+        unet_path: str = None,
+        text_encoder_path: str = None,
+        movq_path: str = None,
+) -> Kandinsky3T2IPipeline:
+    # assert ((unet_path is not None) or (text_encoder_path is not None) or (movq_path is not None))
+    if not isinstance(device_map, dict):
+        device_map = {
+            'unet': device_map, 'text_encoder': device_map, 'movq': device_map
+        }
+    if not isinstance(dtype_map, dict):
+        dtype_map = {
+            'unet': dtype_map, 'text_encoder': dtype_map, 'movq': dtype_map
+        }
+    if unet_path is None:
+        unet_path = hf_hub_download(
+            repo_id="ai-forever/Kandinsky3.1", filename='weights/kandinsky3_flash.pt', cache_dir=cache_dir
+        )
+    if text_encoder_path is None:
+        text_encoder_path = snapshot_download(
+            repo_id="ai-forever/Kandinsky3.1", allow_patterns="weights/flan_ul2_encoder/*", cache_dir=cache_dir
+        )
+        text_encoder_path = os.path.join(text_encoder_path, 'weights/flan_ul2_encoder')
+    if movq_path is None:
+        movq_path = hf_hub_download(
+            repo_id="ai-forever/Kandinsky3.1", filename='weights/movq.pt', cache_dir=cache_dir
+        )
+    unet, null_embedding = get_T2I_unet(device_map['unet'], unet_path, dtype=dtype_map['unet'])
+    processor, condition_encoder = get_T5encoder(
+        device_map['text_encoder'], text_encoder_path, 'projection_flash.pt', dtype=dtype_map['text_encoder'],
+        low_cpu_mem_usage=low_cpu_mem_usage, load_in_8bit=load_in_8bit, load_in_4bit=load_in_4bit
+    )
+    movq = get_movq(device_map['movq'], movq_path, dtype=dtype_map['movq'])
+    return Kandinsky3T2IPipeline(
+        device_map, dtype_map, unet, null_embedding, processor, condition_encoder, movq, True
+    )
+def get_inpainting_pipeline(
+        device_map: Union[str, torch.device, dict],
+        dtype_map: Union[str, torch.dtype, dict] = torch.float32,
+        low_cpu_mem_usage: bool = True,
+        load_in_8bit: bool = False,
+        load_in_4bit: bool = False,
+        cache_dir: str = '/tmp/kandinsky3/',
+        unet_path: str = None,
+        text_encoder_path: str = None,
+        movq_path: str = None,
+) -> Kandinsky3InpaintingPipeline:
+    # assert ((unet_path is not None) or (text_encoder_path is not None) or (movq_path is not None))
+    if not isinstance(device_map, dict):
+        device_map = {
+            'unet': device_map, 'text_encoder': device_map, 'movq': device_map
+        }
+    if not isinstance(dtype_map, dict):
+        dtype_map = {
+            'unet': dtype_map, 'text_encoder': dtype_map, 'movq': dtype_map
+        }
+    if unet_path is None:
+        unet_path = hf_hub_download(
+            repo_id="ai-forever/Kandinsky3.1", filename='weights/kandinsky3_inpainting.pt', cache_dir=cache_dir
+        )
+    if text_encoder_path is None:
+        text_encoder_path = snapshot_download(
+            repo_id="ai-forever/Kandinsky3.1", allow_patterns="weights/flan_ul2_encoder/*", cache_dir=cache_dir
+        )
+        text_encoder_path = os.path.join(text_encoder_path, 'weights/flan_ul2_encoder')
+    if movq_path is None:
+        movq_path = hf_hub_download(
+            repo_id="ai-forever/Kandinsky3.1", filename='weights/movq.pt', cache_dir=cache_dir
+        )
+    unet, null_embedding = get_inpainting_unet(device_map['unet'], unet_path, dtype=dtype_map['unet'])
+    processor, condition_encoder = get_T5encoder(
+        device_map['text_encoder'], text_encoder_path, 'projection_inpainting.pt', dtype=dtype_map['text_encoder'],
+        low_cpu_mem_usage=low_cpu_mem_usage, load_in_8bit=load_in_8bit, load_in_4bit=load_in_4bit
+    )
+    movq = get_movq(device_map['movq'], movq_path, dtype=dtype_map['movq'])
+    return Kandinsky3InpaintingPipeline(
+        device_map, dtype_map, unet, null_embedding, processor, condition_encoder, movq
+    )

kandinsky3/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (5.38 kB). View file

kandinsky3/__pycache__/condition_encoders.cpython-310.pyc ADDED Viewed

Binary file (1.84 kB). View file

kandinsky3/__pycache__/condition_processors.cpython-310.pyc ADDED Viewed

Binary file (1.46 kB). View file

kandinsky3/__pycache__/inpainting_pipeline.cpython-310.pyc ADDED Viewed

Binary file (5.32 kB). View file

kandinsky3/__pycache__/movq.cpython-310.pyc ADDED Viewed

Binary file (10 kB). View file

kandinsky3/__pycache__/t2i_pipeline.cpython-310.pyc ADDED Viewed

Binary file (3.63 kB). View file

kandinsky3/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (2.31 kB). View file

kandinsky3/condition_encoders.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import torch
+from torch import nn
+from transformers import T5EncoderModel
+from typing import Optional, Union
+class T5TextConditionEncoder(nn.Module):
+    def __init__(
+            self, model_path, context_dim,
+            low_cpu_mem_usage: bool = True, device: Optional[str] = None,
+            dtype: Union[str, torch.dtype] = torch.float32, load_in_4bit: bool = False, load_in_8bit: bool = False
+    ):
+        super().__init__()
+        self.encoder = T5EncoderModel.from_pretrained(
+            model_path, low_cpu_mem_usage=low_cpu_mem_usage, device_map=device,
+            torch_dtype=dtype, load_in_8bit=load_in_8bit, load_in_4bit=load_in_4bit,
+        ).encoder
+        self.projection = nn.Sequential(
+            nn.Linear(self.encoder.config.d_model, context_dim, bias=False),
+            nn.LayerNorm(context_dim)
+        )
+    def forward(self, model_input):
+        embeddings = self.encoder(**model_input).last_hidden_state
+        context = self.projection(embeddings)
+        if 'attention_mask' in model_input:
+            context_mask = model_input['attention_mask']
+            context[context_mask == 0] = torch.zeros_like(context[context_mask == 0])
+            max_seq_length = context_mask.sum(-1).max() + 1
+            context = context[:, :max_seq_length]
+            context_mask = context_mask[:, :max_seq_length]
+        else:
+            context_mask = torch.ones(*embeddings.shape[:-1], dtype=torch.long, device=embeddings.device)
+        return context, context_mask
+def get_condition_encoder(conf):
+    return T5TextConditionEncoder(**conf)

kandinsky3/condition_processors.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import torch
+from transformers import T5Tokenizer
+class T5TextConditionProcessor:
+    def __init__(self, tokens_length, processor_path):
+        self.tokens_length = tokens_length
+        self.processor = T5Tokenizer.from_pretrained(processor_path)
+    def encode(self, text=None, negative_text=None):
+        encoded = self.processor(text, max_length=self.tokens_length, truncation=True)
+        pad_length = self.tokens_length - len(encoded['input_ids'])
+        input_ids = encoded['input_ids'] + [self.processor.pad_token_id] * pad_length
+        attention_mask = encoded['attention_mask'] + [0] * pad_length
+        condition_model_input = {
+            'input_ids': torch.tensor(input_ids, dtype=torch.long),
+            'attention_mask': torch.tensor(attention_mask, dtype=torch.long)
+        }
+        if negative_text is not None:
+            negative_encoded = self.processor(negative_text, max_length=self.tokens_length, truncation=True)
+            negative_input_ids = negative_encoded['input_ids'][:len(encoded['input_ids'])]
+            negative_input_ids[-1] = self.processor.eos_token_id
+            negative_pad_length = self.tokens_length - len(negative_input_ids)
+            negative_input_ids = negative_input_ids + [self.processor.pad_token_id] * negative_pad_length
+            negative_attention_mask = encoded['attention_mask'] + [0] * pad_length
+            negative_condition_model_input = {
+                'input_ids': torch.tensor(negative_input_ids, dtype=torch.long),
+                'attention_mask': torch.tensor(negative_attention_mask, dtype=torch.long)
+            }
+        else:
+            negative_condition_model_input = None
+        return condition_model_input, negative_condition_model_input

kandinsky3/inpainting_pipeline.py ADDED Viewed

	@@ -0,0 +1,168 @@

+from typing import Union, List
+import PIL
+import numpy as np
+import torch
+import torchvision.transforms as T
+from einops import repeat
+from kandinsky3.model.unet import UNet
+from kandinsky3.movq import MoVQ
+from kandinsky3.condition_encoders import T5TextConditionEncoder
+from kandinsky3.condition_processors import T5TextConditionProcessor
+from kandinsky3.model.diffusion import BaseDiffusion, get_named_beta_schedule
+from kandinsky3.utils import resize_image_for_diffusion, resize_mask_for_diffusion
+class Kandinsky3InpaintingPipeline:
+    def __init__(
+            self,
+            device_map: Union[str, torch.device, dict],
+            dtype_map: Union[str, torch.dtype, dict],
+            unet: UNet,
+            null_embedding: torch.Tensor,
+            t5_processor: T5TextConditionProcessor,
+            t5_encoder: T5TextConditionEncoder,
+            movq: MoVQ,
+    ):
+        self.device_map = device_map
+        self.dtype_map = dtype_map
+        self.to_pil = T.ToPILImage()
+        self.to_tensor = T.ToTensor()
+        self.unet = unet
+        self.null_embedding = null_embedding
+        self.t5_processor = t5_processor
+        self.t5_encoder = t5_encoder
+        self.movq = movq
+    def shared_step(self, batch: dict) -> dict:
+        image = batch['image']
+        condition_model_input = batch['text']
+        negative_condition_model_input = batch['negative_text']
+        bs = image.shape[0]
+        masked_latent = None
+        mask = batch['mask']
+        if 'masked_image' in batch:
+            masked_latent = batch['masked_image']
+        elif self.unet.in_layer.in_channels == 9:
+            masked_latent = image.masked_fill((1 - mask).bool(), 0)
+        else:
+            raise ValueError()
+        with torch.cuda.amp.autocast(dtype=self.dtype_map['movq']):
+            masked_latent = self.movq.encode(masked_latent)
+        mask = torch.nn.functional.interpolate(mask, size=(masked_latent.shape[2], masked_latent.shape[3]))
+        with torch.cuda.amp.autocast(dtype=self.dtype_map['text_encoder']):
+            context, context_mask = self.t5_encoder(condition_model_input)
+        if negative_condition_model_input is not None:
+            negative_context, negative_context_mask = self.t5_encoder(negative_condition_model_input)
+        else:
+            negative_context, negative_context_mask = None, None
+        return {
+            'context': context,
+            'context_mask': context_mask,
+            'negative_context': negative_context,
+            'negative_context_mask': negative_context_mask,
+            'image': image,
+            'masked_latent': masked_latent,
+            'mask': mask
+        }
+    def prepare_batch(
+            self,
+            text: str,
+            negative_text: str,
+            image: PIL.Image.Image,
+            mask: np.ndarray,
+    ) -> dict:
+        condition_model_input, negative_condition_model_input = self.t5_processor.encode(
+            text=text, negative_text=negative_text
+        )
+        batch = {
+            'image': self.to_tensor(resize_image_for_diffusion(image.convert("RGB"))) * 2 - 1,
+            'mask': 1 - self.to_tensor(resize_mask_for_diffusion(mask)),
+            'text': condition_model_input,
+            'negative_text': negative_condition_model_input
+        }
+        batch['mask'] = batch['mask'].type(self.dtype_map['movq'])
+        batch['image'] = batch['image'].unsqueeze(0).to(self.device_map['movq'])
+        batch['text']['input_ids'] = batch['text']['input_ids'].unsqueeze(0).to(self.device_map['text_encoder'])
+        batch['text']['attention_mask'] = batch['text']['attention_mask'].unsqueeze(0).to(
+            self.device_map['text_encoder'])
+        batch['mask'] = batch['mask'].unsqueeze(0).to(self.device_map['movq'])
+        if negative_condition_model_input is not None:
+            batch['negative_text']['input_ids'] = batch['negative_text']['input_ids'].to(
+                self.device_map['text_encoder'])
+            batch['negative_text']['attention_mask'] = batch['negative_text']['attention_mask'].to(
+                self.device_map['text_encoder'])
+        return batch
+    def __call__(
+            self,
+            text: str,
+            image: PIL.Image.Image,
+            mask: np.ndarray,
+            negative_text: str = None,
+            images_num: int = 1,
+            bs: int = 1,
+            steps: int = 50,
+            guidance_weight_text: float = 4,
+            eta=1.0
+    ) -> List[PIL.Image.Image]:
+        with torch.no_grad():
+            batch = self.prepare_batch(text, negative_text, image, mask)
+            processed = self.shared_step(batch)
+            betas = get_named_beta_schedule('cosine', 1000)
+            base_diffusion = BaseDiffusion(betas, percentile=0.95)
+            times = list(range(999, 0, -1000 // steps))
+            pil_images = []
+            k, m = images_num // bs, images_num % bs
+            for minibatch in [bs] * k + [m]:
+                if minibatch == 0:
+                    continue
+                bs_context = repeat(processed['context'], '1 n d -> b n d', b=minibatch)
+                bs_context_mask = repeat(processed['context_mask'], '1 n -> b n', b=minibatch)
+                if processed['negative_context'] is not None:
+                    bs_negative_context = repeat(processed['negative_context'], '1 n d -> b n d', b=minibatch)
+                    bs_negative_context_mask = repeat(processed['negative_context_mask'], '1 n -> b n', b=minibatch)
+                else:
+                    bs_negative_context, bs_negative_context_mask = None, None
+                mask = processed['mask'].repeat_interleave(minibatch, dim=0)
+                masked_latent = processed['masked_latent'].repeat_interleave(minibatch, dim=0)
+                minibatch = masked_latent.shape[0]
+                with torch.cuda.amp.autocast(dtype=self.dtype_map['unet']):
+                    with torch.no_grad():
+                        images = base_diffusion.p_sample_loop(
+                            self.unet, (minibatch, 4, masked_latent.shape[2], masked_latent.shape[3]), times,
+                            self.device_map['unet'],
+                            bs_context, bs_context_mask, self.null_embedding, guidance_weight_text, eta,
+                            negative_context=bs_negative_context, negative_context_mask=bs_negative_context_mask,
+                            mask=mask, masked_latent=masked_latent, gan=False
+                        )
+                with torch.cuda.amp.autocast(dtype=self.dtype_map['movq']):
+                    images = torch.cat([self.movq.decode(image) for image in images.chunk(2)])
+                    images = torch.clip((images + 1.) / 2., 0., 1.).cpu()
+                for images_chunk in images.chunk(1):
+                    pil_images += [self.to_pil(image) for image in images_chunk]
+        return pil_images

kandinsky3/model/.ipynb_checkpoints/diffusion-checkpoint.py ADDED Viewed

	@@ -0,0 +1,200 @@

+import math
+import torch
+from einops import rearrange
+from tqdm import tqdm
+from .utils import get_tensor_items
+def get_named_beta_schedule(schedule_name, timesteps):
+    if schedule_name == "linear":
+        scale = 1000 / timesteps
+        beta_start = scale * 0.0001
+        beta_end = scale * 0.02
+        return torch.linspace(
+            beta_start, beta_end, timesteps, dtype=torch.float32
+        )
+    elif schedule_name == "cosine":
+        alpha_bar = lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+        betas = []
+        for i in range(timesteps):
+            t1 = i / timesteps
+            t2 = (i + 1) / timesteps
+            betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), 0.999))
+        return torch.tensor(betas, dtype=torch.float32)
+class BaseDiffusion:
+    def __init__(self, betas, percentile=None, gen_noise=torch.randn_like):
+        self.betas = betas
+        self.num_timesteps = betas.shape[0]
+        alphas = 1. - betas
+        self.alphas_cumprod = torch.cumprod(alphas, dim=0)
+        self.alphas_cumprod_prev = torch.cat([torch.ones(1, dtype=betas.dtype), self.alphas_cumprod[:-1]])
+        # calculate q(x_t | x_{t-1})
+        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
+        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1. - self.alphas_cumprod)
+        # calculate q(x_{t-1} | x_t, x_0)
+        self.posterior_mean_coef_1 = torch.sqrt(self.alphas_cumprod_prev) * betas / (1. - self.alphas_cumprod)
+        self.posterior_mean_coef_2 = torch.sqrt(alphas) * (1. - self.alphas_cumprod_prev) / (1. - self.alphas_cumprod)
+        self.posterior_variance = betas * (1. - self.alphas_cumprod_prev) / (1. - self.alphas_cumprod)
+        self.posterior_log_variance = torch.log(
+            torch.cat([self.posterior_variance[1].unsqueeze(0), self.posterior_variance[1:]])
+        )
+        self.percentile = percentile
+        self.time_scale = 1000 // self.num_timesteps
+        self.gen_noise = gen_noise
+        self.jump_length = 3
+    def process_x_start(self, x_start):
+        bs, ndims = x_start.shape[0], len(x_start.shape[1:])
+        if self.percentile is not None:
+            quantile = torch.quantile(
+                rearrange(x_start, 'b ... -> b (...)').abs(),
+                self.percentile,
+                dim=-1
+            )
+            quantile = torch.clip(quantile, min=1.)
+            quantile = quantile.reshape(bs, *((1,) * ndims))
+            return torch.clip(x_start, -quantile, quantile) / quantile
+        else:
+            return torch.clip(x_start, -1., 1.)
+    def get_x_start(self, x, t, noise):
+        sqrt_one_minus_alphas_cumprod = get_tensor_items(self.sqrt_one_minus_alphas_cumprod, t, noise.shape)
+        sqrt_alphas_cumprod = get_tensor_items(self.sqrt_alphas_cumprod, t, noise.shape)
+        pred_x_start = (x - sqrt_one_minus_alphas_cumprod * noise) / sqrt_alphas_cumprod
+        return pred_x_start
+    def get_noise(self, x, t, x_start):
+        sqrt_one_minus_alphas_cumprod = get_tensor_items(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)
+        sqrt_alphas_cumprod = get_tensor_items(self.sqrt_alphas_cumprod, t, x_start.shape)
+        pred_noise = (x - sqrt_alphas_cumprod * x_start) / sqrt_one_minus_alphas_cumprod
+        return pred_noise
+    def q_sample(self, x_start, t, noise=None):
+        if noise is None:
+            noise = self.gen_noise(x_start)
+        sqrt_alphas_cumprod = get_tensor_items(self.sqrt_alphas_cumprod, t, x_start.shape)
+        sqrt_one_minus_alphas_cumprod = get_tensor_items(self.sqrt_one_minus_alphas_cumprod, t, noise.shape)
+        x_t = sqrt_alphas_cumprod * x_start + sqrt_one_minus_alphas_cumprod * noise
+        return x_t
+    def q_posterior_mean_variance(self, x_start, x_t, t):
+        posterior_mean_coef_1 = get_tensor_items(self.posterior_mean_coef_1, t, x_start.shape)
+        posterior_mean_coef_2 = get_tensor_items(self.posterior_mean_coef_2, t, x_t.shape)
+        posterior_mean = posterior_mean_coef_1 * x_start + posterior_mean_coef_2 * x_t
+        posterior_variance = get_tensor_items(self.posterior_variance, t, x_start.shape)
+        posterior_log_variance = get_tensor_items(self.posterior_log_variance, t, x_start.shape)
+        return posterior_mean, posterior_variance, posterior_log_variance
+    def q_posterior_variance(self, t, prev_t, shape, eta=1., ):
+        alphas_cumprod = get_tensor_items(self.alphas_cumprod, t, shape)
+        prev_alphas_cumprod = get_tensor_items(self.alphas_cumprod, prev_t, shape)
+        posterior_variance = torch.sqrt(
+            eta * (1. - alphas_cumprod / prev_alphas_cumprod) * (1. - prev_alphas_cumprod) / (1. - alphas_cumprod)
+        )
+        return posterior_variance
+    def text_guidance(
+            self, model, x, t, context, context_mask, null_embedding, guidance_weight_text,
+            uncondition_context=None, uncondition_context_mask=None, mask=None, masked_latent=None
+    ):
+        large_x = x.repeat(2, 1, 1, 1)
+        large_t = t.repeat(2).to(x.dtype)
+        if uncondition_context is None:
+            uncondition_context = torch.zeros_like(context)
+            uncondition_context_mask = torch.zeros_like(context_mask)
+            uncondition_context[:, 0] = null_embedding
+            uncondition_context_mask[:, 0] = 1
+        large_context = torch.cat([context, uncondition_context])
+        large_context_mask = torch.cat([context_mask, uncondition_context_mask])
+        if mask is not None:
+            mask = mask.repeat(2, 1, 1, 1)
+        if masked_latent is not None:
+            masked_latent = masked_latent.repeat(2, 1, 1, 1)
+        if model.in_layer.in_channels == 9:
+            large_x = torch.cat([large_x, mask, masked_latent], dim=1)
+        pred_large_noise = model(large_x, large_t * self.time_scale, large_context, large_context_mask.bool())
+        pred_noise, uncond_pred_noise = torch.chunk(pred_large_noise, 2)
+        pred_noise = (guidance_weight_text + 1.) * pred_noise - guidance_weight_text * uncond_pred_noise
+        return pred_noise
+    def p_mean_variance(
+            self, model, x, t, prev_t, context, context_mask, null_embedding, guidance_weight_text, eta=1.,
+            negative_context=None, negative_context_mask=None, mask=None, masked_latent=None
+    ):
+        pred_noise = self.text_guidance(
+            model, x, t, context, context_mask, null_embedding, guidance_weight_text,
+            negative_context, negative_context_mask, mask, masked_latent
+        )
+        pred_x_start = self.get_x_start(x, t, pred_noise)
+        pred_x_start = self.process_x_start(pred_x_start)
+        pred_noise = self.get_noise(x, t, pred_x_start)
+        pred_var = self.q_posterior_variance(t, prev_t, x.shape, eta)
+        prev_alphas_cumprod = get_tensor_items(self.alphas_cumprod, prev_t, x.shape)
+        pred_mean = torch.sqrt(prev_alphas_cumprod) * pred_x_start
+        pred_mean += torch.sqrt(1. - prev_alphas_cumprod - pred_var ** 2) * pred_noise
+        return pred_mean, pred_var
+    # @torch.no_grad()
+    def p_sample(
+            self, model, x, t, prev_t, context, context_mask, null_embedding, guidance_weight_text, eta=1.,
+            negative_context=None, negative_context_mask=None, mask=None, masked_latent=None
+    ):
+        bs = x.shape[0]
+        ndims = len(x.shape[1:])
+        pred_mean, pred_var = self.p_mean_variance(
+            model, x, t, prev_t, context, context_mask, null_embedding, guidance_weight_text, eta,
+            negative_context=negative_context, negative_context_mask=negative_context_mask,
+            mask=mask, masked_latent=masked_latent
+        )
+        noise = torch.randn_like(x)
+        mask = (prev_t != 0).reshape(bs, *((1,) * ndims))
+        sample = pred_mean + mask * pred_var * noise
+        return sample
+    # @torch.no_grad()
+    def p_sample_loop(
+            self, model, shape, times, device, context, context_mask, null_embedding, guidance_weight_text, eta=1.,
+            negative_context=None, negative_context_mask=None, mask=None, masked_latent=None, gan=False,
+    ):
+        img = torch.randn(*shape, device=device)
+        times = times + [0, ]
+        times = list(zip(times[:-1], times[1:]))
+        for time, prev_time in tqdm(times):
+            time = torch.tensor([time] * shape[0], device=device)
+            if gan:
+                x_t = self.q_sample(img, time)
+                pred_noise = model(x_t, time.type(x_t.dtype), context, context_mask.bool())
+                img = self.get_x_start(x_t, time, pred_noise)
+            else:
+                prev_time = torch.tensor([prev_time] * shape[0], device=device)
+                img = self.p_sample(
+                    model, img, time, prev_time, context, context_mask, null_embedding, guidance_weight_text, eta,
+                    negative_context=negative_context, negative_context_mask=negative_context_mask,
+                    mask=mask, masked_latent=masked_latent
+                )
+        return img
+def get_diffusion(conf):
+    betas = get_named_beta_schedule(**conf.schedule_params)
+    base_diffusion = BaseDiffusion(betas, **conf.diffusion_params)
+    return base_diffusion

kandinsky3/model/.ipynb_checkpoints/unet-checkpoint.py ADDED Viewed

	@@ -0,0 +1,516 @@

+import torch
+from torch import nn, einsum
+from einops import rearrange
+from .nn import Identity, Attention, SinusoidalPosEmb, ConditionalGroupNorm
+from .utils import exist, set_default_item, set_default_layer
+import torch.nn.functional as F
+class Block(nn.Module):
+    # One stage of a residual block: conditional group norm -> SiLU -> optional up-sampling
+    # -> convolutional projection -> optional down-sampling.
+    def __init__(self, in_channels, out_channels, time_embed_dim, kernel_size=3, norm_groups=32, up_resolution=None):
+        super().__init__()
+        self.group_norm = ConditionalGroupNorm(norm_groups, in_channels, time_embed_dim)
+        self.activation = nn.SiLU()
+        # Stride-2 transposed conv (channels unchanged) is requested only when up_resolution is True.
+        # What set_default_layer builds otherwise is defined in .utils -- presumably a no-op layer.
+        self.up_sample = set_default_layer(
+            exist(up_resolution) and up_resolution,
+            nn.ConvTranspose2d, (in_channels, in_channels), {'kernel_size': 2, 'stride': 2}
+        )
+        # Presumably 0 for a 1x1 kernel and 1 otherwise, which keeps the spatial size for 3x3 kernels
+        # -- confirm set_default_item's argument order in .utils.
+        padding = set_default_item(kernel_size == 1, 0, 1)
+        self.projection = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=padding)
+        # Stride-2 conv is requested only when up_resolution is explicitly False.
+        self.down_sample = set_default_layer(
+            exist(up_resolution) and not up_resolution,
+            nn.Conv2d, (out_channels, out_channels), {'kernel_size': 2, 'stride': 2}
+        )
+    def forward(self, x, time_embed):
+        # time_embed is consumed only by the conditional group norm.
+        x = self.group_norm(x, time_embed)
+        x = self.activation(x)
+        x = self.up_sample(x)
+        x = self.projection(x)
+        x = self.down_sample(x)
+        return x
+class ResNetBlock(nn.Module):
+    # Residual block made of four Block stages (kernel sizes 1, 3, 3, 1) plus a shortcut path
+    # that applies the same resampling / channel change so both branches can be summed.
+    def __init__(
+            self, in_channels, out_channels, time_embed_dim, norm_groups=32, compression_ratio=2, up_resolutions=4*[None]
+    ):
+        super().__init__()
+        kernel_sizes = [1, 3, 3, 1]
+        # Bottleneck width: the larger of the in/out widths divided by compression_ratio.
+        hidden_channel = max(in_channels, out_channels) // compression_ratio
+        # (in, out) channel pairs of the four stages: in -> hidden -> hidden -> hidden -> out.
+        hidden_channels = [(in_channels, hidden_channel)] + [(hidden_channel, hidden_channel)] * 2 + [(hidden_channel, out_channels)]
+        self.resnet_blocks = nn.ModuleList([
+            Block(in_channel, out_channel, time_embed_dim, kernel_size, norm_groups, up_resolution)
+            for (in_channel, out_channel), kernel_size, up_resolution in zip(hidden_channels, kernel_sizes, up_resolutions)
+        ])
+        # Shortcut up-sampling is requested when any stage up-samples.
+        self.shortcut_up_sample = set_default_layer(
+            True in up_resolutions,
+            nn.ConvTranspose2d, (in_channels, in_channels), {'kernel_size': 2, 'stride': 2}
+        )
+        # 1x1 projection is requested only when the channel count changes.
+        self.shortcut_projection = set_default_layer(
+            in_channels != out_channels,
+            nn.Conv2d, (in_channels, out_channels), {'kernel_size': 1}
+        )
+        # Shortcut down-sampling is requested when any stage down-samples.
+        self.shortcut_down_sample = set_default_layer(
+            False in up_resolutions,
+            nn.Conv2d, (out_channels, out_channels), {'kernel_size': 2, 'stride': 2}
+        )
+    def forward(self, x, time_embed):
+        # Main branch: the four stages in sequence.
+        out = x
+        for resnet_block in self.resnet_blocks:
+            out = resnet_block(out, time_embed)
+        # Shortcut branch, then residual sum.
+        x = self.shortcut_up_sample(x)
+        x = self.shortcut_projection(x)
+        x = self.shortcut_down_sample(x)
+        x = x + out
+        return x
+class AttentionPolling(nn.Module):
+    # Pools a context sequence with attention and adds the pooled vector to x.
+    def __init__(self, num_channels, context_dim, head_dim=64):
+        super().__init__()
+        self.attention = Attention(context_dim, num_channels, context_dim, head_dim)
+    def forward(self, x, context, context_mask=None):
+        # The query is the mean of the context over dim 1 (kept as a length-1 sequence);
+        # the attention output is squeezed back on dim 1 before being added to x.
+        context = self.attention(context.mean(dim=1, keepdim=True), context, context_mask)
+        return x + context.squeeze(1)
+class AttentionBlock(nn.Module):
+    # Attention over flattened spatial positions followed by a 1x1-conv feed-forward,
+    # each preceded by a conditional group norm and wrapped in a residual connection.
+    def __init__(self, num_channels, time_embed_dim, context_dim=None, norm_groups=32, head_dim=64, expansion_ratio=4):
+        super().__init__()
+        self.in_norm = ConditionalGroupNorm(norm_groups, num_channels, time_embed_dim)
+        # Without a context_dim the context width falls back to num_channels (self-attention).
+        self.attention = Attention(num_channels, num_channels, context_dim or num_channels, head_dim)
+        hidden_channels = expansion_ratio * num_channels
+        self.out_norm = ConditionalGroupNorm(norm_groups, num_channels, time_embed_dim)
+        self.feed_forward = nn.Sequential(
+            nn.Conv2d(num_channels, hidden_channels, kernel_size=1, bias=False),
+            nn.SiLU(),
+            nn.Conv2d(hidden_channels, num_channels, kernel_size=1, bias=False),
+        )
+    def forward(self, x, time_embed, context=None, context_mask=None):
+        height, width = x.shape[-2:]
+        out = self.in_norm(x, time_embed)
+        # Flatten the spatial grid into a token sequence for attention.
+        out = rearrange(out, 'b c h w -> b (h w) c', h=height, w=width)
+        # Presumably selects `context` when it is given and the tokens themselves otherwise
+        # -- confirm set_default_item's argument order in .utils.
+        context = set_default_item(exist(context), context, out)
+        out = self.attention(out, context, context_mask)
+        # Restore the (b, c, h, w) layout.
+        out = rearrange(out, 'b (h w) c -> b c h w', h=height, w=width)
+        x = x + out
+        out = self.out_norm(x, time_embed)
+        out = self.feed_forward(out)
+        x = x + out
+        return x
+class DownSampleBlock(nn.Module):
+    # Encoder level: optional self-attention, then num_blocks groups of
+    # (ResNetBlock, optional cross-attention, ResNetBlock).
+    def __init__(
+            self, in_channels, out_channels, time_embed_dim, context_dim=None,
+            num_blocks=3, groups=32, head_dim=64, expansion_ratio=4, compression_ratio=2,
+            down_sample=True, self_attention=True
+    ):
+        super().__init__()
+        # Identity is used in place of the attention block when self_attention is falsy.
+        self.self_attention_block = set_default_layer(
+            self_attention,
+            AttentionBlock,
+            (in_channels, time_embed_dim, None, groups, head_dim, expansion_ratio),
+            layer_2=Identity
+        )
+        # Only the last group carries a resampling flag, at stage index 2 of its second ResNetBlock.
+        # Presumably the flag is False (down-sample) when down_sample is truthy and None otherwise
+        # -- confirm set_default_item's default in .utils.
+        up_resolutions = [[None] * 4] * (num_blocks - 1) + [[None, None, set_default_item(down_sample, False), None]]
+        # The first group changes the width in -> out; the remaining groups keep it at out.
+        hidden_channels = [(in_channels, out_channels)] + [(out_channels, out_channels)] * (num_blocks - 1)
+        self.resnet_attn_blocks = nn.ModuleList([
+            nn.ModuleList([
+                ResNetBlock(in_channel, out_channel, time_embed_dim, groups, compression_ratio),
+                # Cross-attention only when a context_dim is given; Identity otherwise.
+                set_default_layer(
+                    exist(context_dim),
+                    AttentionBlock,
+                    (out_channel, time_embed_dim, context_dim, groups, head_dim, expansion_ratio),
+                    layer_2=Identity
+                ),
+                ResNetBlock(out_channel, out_channel, time_embed_dim, groups, compression_ratio, up_resolution),
+            ]) for (in_channel, out_channel), up_resolution in zip(hidden_channels, up_resolutions)
+        ])
+    # NOTE(review): control_net_residual is accepted but never used in this method.
+    def forward(self, x, time_embed, context=None, context_mask=None, control_net_residual=None):
+        x = self.self_attention_block(x, time_embed)
+        for in_resnet_block, attention, out_resnet_block in self.resnet_attn_blocks:
+            x = in_resnet_block(x, time_embed)
+            x = attention(x, time_embed, context, context_mask)
+            x = out_resnet_block(x, time_embed)
+        return x
+class UpSampleBlock(nn.Module):
+    # Decoder level: num_blocks groups of (ResNetBlock, optional cross-attention, ResNetBlock),
+    # followed by optional self-attention.
+    def __init__(
+            self, in_channels, cat_dim, out_channels, time_embed_dim, context_dim=None,
+            num_blocks=3, groups=32, head_dim=64, expansion_ratio=4, compression_ratio=2,
+            up_sample=True, self_attention=True
+    ):
+        super().__init__()
+        # Only the first group carries a resampling flag, at stage index 1 of its first ResNetBlock.
+        # Presumably the flag is True (up-sample) when up_sample is truthy and None otherwise
+        # -- confirm set_default_item's default in .utils.
+        up_resolutions = [[None, set_default_item(up_sample, True), None, None]] + [[None] * 4] * (num_blocks - 1)
+        # The first group also takes the cat_dim extra input channels; the last group maps to out_channels.
+        hidden_channels = [(in_channels + cat_dim, in_channels)] + [(in_channels, in_channels)] * (num_blocks - 2) + [(in_channels, out_channels)]
+        self.resnet_attn_blocks = nn.ModuleList([
+            nn.ModuleList([
+                ResNetBlock(in_channel, in_channel, time_embed_dim, groups, compression_ratio, up_resolution),
+                # Cross-attention only when a context_dim is given; Identity otherwise.
+                set_default_layer(
+                    exist(context_dim),
+                    AttentionBlock,
+                    (in_channel, time_embed_dim, context_dim, groups, head_dim, expansion_ratio),
+                    layer_2=Identity
+                ),
+                ResNetBlock(in_channel, out_channel, time_embed_dim, groups, compression_ratio),
+            ]) for (in_channel, out_channel), up_resolution in zip(hidden_channels, up_resolutions)
+        ])
+        # Identity is used in place of the attention block when self_attention is falsy.
+        self.self_attention_block = set_default_layer(
+            self_attention,
+            AttentionBlock,
+            (out_channels, time_embed_dim, None, groups, head_dim, expansion_ratio),
+            layer_2=Identity
+        )
+    def forward(self, x, time_embed, context=None, context_mask=None):
+        for in_resnet_block, attention, out_resnet_block in self.resnet_attn_blocks:
+            x = in_resnet_block(x, time_embed)
+            x = attention(x, time_embed, context, context_mask)
+            x = out_resnet_block(x, time_embed)
+        x = self.self_attention_block(x, time_embed)
+        return x
+class ControlNetModel(nn.Module):
+    # Encoder-only network: time embedding, input conv and the stack of DownSampleBlocks.
+    # NOTE(review): a second class named ControlNetModel is defined further down in this file;
+    # the later definition rebinds the name, so this one is not reachable by name after import.
+    def __init__(self,
+                 model_channels,
+                 init_channels=None,
+                 num_channels=3,
+                 out_channels=4,
+                 time_embed_dim=None,
+                 context_dim=None,
+                 groups=32,
+                 head_dim=64,
+                 expansion_ratio=4,
+                 compression_ratio=2,
+                 dim_mult=(1, 2, 4, 8),
+                 num_blocks=(3, 3, 3, 3),
+                 add_cross_attention=(False, True, True, True),
+                 add_self_attention=(False, True, True, True)
+                 ):
+        super().__init__()
+        # init_channels falls back to model_channels when not given.
+        init_channels = init_channels or model_channels
+        # NOTE(review): time_embed_dim defaults to None but is passed straight to nn.Linear,
+        # so callers have to supply it.
+        self.to_time_embed = nn.Sequential(
+            SinusoidalPosEmb(init_channels),
+            nn.Linear(init_channels, time_embed_dim),
+            nn.SiLU(),
+            nn.Linear(time_embed_dim, time_embed_dim)
+        )
+        self.feature_pooling = AttentionPolling(time_embed_dim, context_dim, head_dim)
+        self.in_layer = nn.Conv2d(num_channels, init_channels, kernel_size=3, padding=1)
+        # Channel width per level: init_channels, then model_channels * mult for each multiplier.
+        hidden_dims = [init_channels, *map(lambda mult: model_channels * mult, dim_mult)]
+        in_out_dims = list(zip(hidden_dims[:-1], hidden_dims[1:]))
+        # Per-level context width for the levels flagged in add_cross_attention.
+        text_dims = [set_default_item(is_exist, context_dim) for is_exist in add_cross_attention]
+        layer_params = [num_blocks, text_dims, add_self_attention]
+        # rev_layer_params, cat_dims and out_channels are not used further in this class.
+        rev_layer_params = map(reversed, layer_params)
+        cat_dims = []
+        self.num_levels = len(in_out_dims)
+        self.down_samples = nn.ModuleList([])
+        for level, ((in_dim, out_dim), res_block_num, text_dim, self_attention) in enumerate(zip(in_out_dims, *layer_params)):
+            # Every level except the last one down-samples.
+            down_sample = level != (self.num_levels - 1)
+            cat_dims.append(set_default_item(level != (self.num_levels - 1), out_dim, 0))
+            self.down_samples.append(
+                DownSampleBlock(
+                    in_dim, out_dim, time_embed_dim, text_dim, res_block_num, groups, head_dim, expansion_ratio,
+                    compression_ratio, down_sample, self_attention
+                )
+            )
+    def forward(self, x, time, context=None, context_mask=None):
+        time_embed = self.to_time_embed(time)
+        # When a context is given, its attention-pooled summary is added to the time embedding.
+        if exist(context):
+            time_embed = self.feature_pooling(time_embed, context, context_mask)
+        hidden_states = []
+        x = self.in_layer(x)
+        for level, down_sample in enumerate(self.down_samples):
+            x = down_sample(x, time_embed, context, context_mask)
+            # Collect the output of every level except the last.
+            if level != self.num_levels - 1:
+                hidden_states.append(x)
+        # Returns the list of intermediate feature maps, not the final x.
+        return hidden_states
+class UNet(nn.Module):
+    # U-shaped network: time embedding (optionally enriched with pooled context), input conv,
+    # a stack of DownSampleBlocks, a mirrored stack of UpSampleBlocks with skip connections,
+    # and a GroupNorm -> SiLU -> conv output head.
+    def __init__(self,
+                 model_channels,
+                 init_channels=None,
+                 num_channels=3,
+                 out_channels=4,
+                 time_embed_dim=None,
+                 context_dim=None,
+                 groups=32,
+                 head_dim=64,
+                 expansion_ratio=4,
+                 compression_ratio=2,
+                 dim_mult=(1, 2, 4, 8),
+                 num_blocks=(3, 3, 3, 3),
+                 add_cross_attention=(False, True, True, True),
+                 add_self_attention=(False, True, True, True),
+                 *args,
+                 **kwargs,
+                 ):
+        # Extra positional / keyword arguments are accepted and ignored.
+        super().__init__()
+        # init_channels falls back to model_channels when not given.
+        init_channels = init_channels or model_channels
+        # NOTE(review): time_embed_dim defaults to None but is passed straight to nn.Linear,
+        # so callers have to supply it.
+        self.to_time_embed = nn.Sequential(
+            SinusoidalPosEmb(init_channels),
+            nn.Linear(init_channels, time_embed_dim),
+            nn.SiLU(),
+            nn.Linear(time_embed_dim, time_embed_dim)
+        )
+        self.feature_pooling = AttentionPolling(time_embed_dim, context_dim, head_dim)
+        self.in_layer = nn.Conv2d(num_channels, init_channels, kernel_size=3, padding=1)
+        # Channel width per level: init_channels, then model_channels * mult for each multiplier.
+        hidden_dims = [init_channels, *map(lambda mult: model_channels * mult, dim_mult)]
+        in_out_dims = list(zip(hidden_dims[:-1], hidden_dims[1:]))
+        # Per-level context width for the levels flagged in add_cross_attention.
+        text_dims = [set_default_item(is_exist, context_dim) for is_exist in add_cross_attention]
+        layer_params = [num_blocks, text_dims, add_self_attention]
+        # Lazy iterator over the reversed per-level settings; consumed once by the decoder loop below.
+        rev_layer_params = map(reversed, layer_params)
+        # Skip-connection widths, pushed by the encoder loop and popped by the decoder loop.
+        cat_dims = []
+        self.num_levels = len(in_out_dims)
+        self.down_samples = nn.ModuleList([])
+        for level, ((in_dim, out_dim), res_block_num, text_dim, self_attention) in enumerate(zip(in_out_dims, *layer_params)):
+            # Every level except the last one down-samples.
+            down_sample = level != (self.num_levels - 1)
+            cat_dims.append(set_default_item(level != (self.num_levels - 1), out_dim, 0))
+            self.down_samples.append(
+                DownSampleBlock(
+                    in_dim, out_dim, time_embed_dim, text_dim, res_block_num, groups, head_dim, expansion_ratio,
+                    compression_ratio, down_sample, self_attention
+                )
+            )
+        self.up_samples = nn.ModuleList([])
+        # Decoder levels walk the encoder levels in reverse, so each (in, out) pair is swapped.
+        for level, ((out_dim, in_dim), res_block_num, text_dim, self_attention) in enumerate(zip(reversed(in_out_dims), *rev_layer_params)):
+            # Every decoder level except the first one up-samples.
+            up_sample = level != 0
+            self.up_samples.append(
+                UpSampleBlock(
+                    in_dim, cat_dims.pop(), out_dim, time_embed_dim, text_dim, res_block_num, groups, head_dim,
+                    expansion_ratio, compression_ratio, up_sample, self_attention
+                )
+            )
+        self.out_layer = nn.Sequential(
+            nn.GroupNorm(groups, init_channels),
+            nn.SiLU(),
+            nn.Conv2d(init_channels, out_channels, kernel_size=3, padding=1)
+        )
+        self.control_net = None
+    def forward(self, x, time, context=None, context_mask=None, control_net_residual=None):
+        time_embed = self.to_time_embed(time)
+        # When a context is given, its attention-pooled summary is added to the time embedding.
+        if exist(context):
+            time_embed = self.feature_pooling(time_embed, context, context_mask)
+        hidden_states = []
+        x = self.in_layer(x)
+        for level, down_sample in enumerate(self.down_samples):
+            x = down_sample(x, time_embed, context, context_mask, control_net_residual)
+            # Keep the output of every level except the last as a skip connection.
+            if level != self.num_levels - 1:
+                hidden_states.append(x)
+        for level, up_sample in enumerate(self.up_samples):
+            # Every decoder level except the first concatenates the most recent skip on the channel axis.
+            if level != 0:
+                x = torch.cat([x, hidden_states.pop()], dim=1)
+            x = up_sample(x, time_embed, context, context_mask)
+        x = self.out_layer(x)
+        return x
+class ControlNetModel(nn.Module):
+    # Encoder-only network: time embedding, input conv and the stack of DownSampleBlocks.
+    # NOTE(review): this redefines a class of the same name declared earlier in this file;
+    # the only visible difference is that this version accepts (and ignores) *args / **kwargs.
+    def __init__(self,
+                 model_channels,
+                 init_channels=None,
+                 num_channels=3,
+                 out_channels=4,
+                 time_embed_dim=None,
+                 context_dim=None,
+                 groups=32,
+                 head_dim=64,
+                 expansion_ratio=4,
+                 compression_ratio=2,
+                 dim_mult=(1, 2, 4, 8),
+                 num_blocks=(3, 3, 3, 3),
+                 add_cross_attention=(False, True, True, True),
+                 add_self_attention=(False, True, True, True),
+                *args,
+                 **kwargs,
+                 ):
+        super().__init__()
+        # init_channels falls back to model_channels when not given.
+        init_channels = init_channels or model_channels
+        # NOTE(review): time_embed_dim defaults to None but is passed straight to nn.Linear,
+        # so callers have to supply it.
+        self.to_time_embed = nn.Sequential(
+            SinusoidalPosEmb(init_channels),
+            nn.Linear(init_channels, time_embed_dim),
+            nn.SiLU(),
+            nn.Linear(time_embed_dim, time_embed_dim)
+        )
+        self.feature_pooling = AttentionPolling(time_embed_dim, context_dim, head_dim)
+        self.in_layer = nn.Conv2d(num_channels, init_channels, kernel_size=3, padding=1)
+        # Channel width per level: init_channels, then model_channels * mult for each multiplier.
+        hidden_dims = [init_channels, *map(lambda mult: model_channels * mult, dim_mult)]
+        in_out_dims = list(zip(hidden_dims[:-1], hidden_dims[1:]))
+        # Per-level context width for the levels flagged in add_cross_attention.
+        text_dims = [set_default_item(is_exist, context_dim) for is_exist in add_cross_attention]
+        layer_params = [num_blocks, text_dims, add_self_attention]
+        # rev_layer_params, cat_dims and out_channels are not used further in this class.
+        rev_layer_params = map(reversed, layer_params)
+        cat_dims = []
+        self.num_levels = len(in_out_dims)
+        self.down_samples = nn.ModuleList([])
+        for level, ((in_dim, out_dim), res_block_num, text_dim, self_attention) in enumerate(zip(in_out_dims, *layer_params)):
+            # Every level except the last one down-samples.
+            down_sample = level != (self.num_levels - 1)
+            cat_dims.append(set_default_item(level != (self.num_levels - 1), out_dim, 0))
+            self.down_samples.append(
+                DownSampleBlock(
+                    in_dim, out_dim, time_embed_dim, text_dim, res_block_num, groups, head_dim, expansion_ratio,
+                    compression_ratio, down_sample, self_attention
+                )
+            )
+    def forward(self, x, time, context=None, context_mask=None):
+        time_embed = self.to_time_embed(time)
+        # When a context is given, its attention-pooled summary is added to the time embedding.
+        if exist(context):
+            time_embed = self.feature_pooling(time_embed, context, context_mask)
+        hidden_states = []
+        x = self.in_layer(x)
+        for level, down_sample in enumerate(self.down_samples):
+            x = down_sample(x, time_embed, context, context_mask)
+            # Collect the output of every level except the last.
+            if level != self.num_levels - 1:
+                hidden_states.append(x)
+        # Returns the list of intermediate feature maps, not the final x.
+        return hidden_states
+class ControlUNet(nn.Module):
+    """U-Net whose encoder features are summed with features from an embedded ControlNetModel.
+    The constructor builds a down-sampling path, a mirrored up-sampling path, an output
+    head, and a ``control_net`` that receives the same channel configuration but takes
+    ``control_net_channels`` input channels.
+    Extra positional and keyword arguments are accepted and ignored.
+    NOTE(review): ``time_embed_dim`` defaults to None but is passed straight to
+    ``nn.Linear``, so callers presumably always supply it -- confirm against the configs.
+    """
+    def __init__(self,
+                 model_channels,
+                 init_channels=None,
+                 num_channels=3,
+                 out_channels=4,
+                 time_embed_dim=None,
+                 context_dim=None,
+                 groups=32,
+                 head_dim=64,
+                 expansion_ratio=4,
+                 compression_ratio=2,
+                 dim_mult=(1, 2, 4, 8),
+                 num_blocks=(3, 3, 3, 3),
+                 add_cross_attention=(False, True, True, True),
+                 add_self_attention=(False, True, True, True),
+                 control_net_channels=5,
+                 *args,
+                 **kwargs,
+                 ):
+        super().__init__()
+        # Fall back to model_channels when no explicit stem width is given.
+        init_channels = init_channels or model_channels
+        self.to_time_embed = nn.Sequential(
+            SinusoidalPosEmb(init_channels),
+            nn.Linear(init_channels, time_embed_dim),
+            nn.SiLU(),
+            nn.Linear(time_embed_dim, time_embed_dim)
+        )
+        self.feature_pooling = AttentionPolling(time_embed_dim, context_dim, head_dim)
+        self.in_layer = nn.Conv2d(num_channels, init_channels, kernel_size=3, padding=1)
+        # Channel width per level: the stem width followed by model_channels * multiplier.
+        hidden_dims = [init_channels, *map(lambda mult: model_channels * mult, dim_mult)]
+        in_out_dims = list(zip(hidden_dims[:-1], hidden_dims[1:]))
+        # Per-level cross-attention context width; set_default_item presumably yields
+        # None for levels where cross attention is disabled -- confirm in .utils.
+        text_dims = [set_default_item(is_exist, context_dim) for is_exist in add_cross_attention]
+        layer_params = [num_blocks, text_dims, add_self_attention]
+        # Lazy iterator of reversed per-level settings, consumed once by the up path below.
+        rev_layer_params = map(reversed, layer_params)
+        # Skip-connection widths recorded on the way down and popped on the way up.
+        cat_dims = []
+        self.num_levels = len(in_out_dims)
+        self.down_samples = nn.ModuleList([])
+        for level, ((in_dim, out_dim), res_block_num, text_dim, self_attention) in enumerate(zip(in_out_dims, *layer_params)):
+            # Every level except the deepest one reduces resolution.
+            down_sample = level != (self.num_levels - 1)
+            cat_dims.append(set_default_item(level != (self.num_levels - 1), out_dim, 0))
+            self.down_samples.append(
+                DownSampleBlock(
+                    in_dim, out_dim, time_embed_dim, text_dim, res_block_num, groups, head_dim, expansion_ratio,
+                    compression_ratio, down_sample, self_attention
+                )
+            )
+        self.up_samples = nn.ModuleList([])
+        for level, ((out_dim, in_dim), res_block_num, text_dim, self_attention) in enumerate(zip(reversed(in_out_dims), *rev_layer_params)):
+            # The first up block (deepest level) keeps resolution; the others up-sample.
+            up_sample = level != 0
+            self.up_samples.append(
+                UpSampleBlock(
+                    in_dim, cat_dims.pop(), out_dim, time_embed_dim, text_dim, res_block_num, groups, head_dim,
+                    expansion_ratio, compression_ratio, up_sample, self_attention
+                )
+            )
+        self.out_layer = nn.Sequential(
+            nn.GroupNorm(groups, init_channels),
+            nn.SiLU(),
+            nn.Conv2d(init_channels, out_channels, kernel_size=3, padding=1)
+        )
+        # Arguments are positional and follow ControlNetModel's parameter order;
+        # control_net_channels lands in its num_channels slot.
+        self.control_net = ControlNetModel(model_channels,
+                                            init_channels,
+                                            control_net_channels,
+                                            out_channels,
+                                            time_embed_dim,
+                                            context_dim,
+                                            groups,
+                                            head_dim,
+                                            expansion_ratio,
+                                            compression_ratio,
+                                            dim_mult,
+                                            num_blocks,
+                                            add_cross_attention,
+                                            add_self_attention)
+    def forward(self, x, time, context=None, context_mask=None, control_net_data=None):
+        """Predict the output map for ``x`` at ``time`` with control-net conditioning.
+        The control net is run first on ``control_net_data``; its per-level outputs are
+        added to the matching encoder outputs before they are stored as skip connections.
+        NOTE(review): ``control_net_data`` defaults to None but is always forwarded to
+        the control net, so it appears to be required in practice.
+        """
+        time_embed = self.to_time_embed(time)
+        if exist(context):
+            time_embed = self.feature_pooling(time_embed, context, context_mask)
+        # The control net receives the raw ``time`` and computes its own embedding.
+        control_net_hiddens =  self.control_net(control_net_data, time, context, context_mask)
+        hidden_states = []
+        x = self.in_layer(x)
+        for level, down_sample in enumerate(self.down_samples):
+            x = down_sample(x, time_embed, context, context_mask)
+            if level != self.num_levels - 1:
+                # In-place add of the next control residual (consumed front to back).
+                x += control_net_hiddens.pop(0)
+                hidden_states.append(x)
+        for level, up_sample in enumerate(self.up_samples):
+            if level != 0:
+                # Skip connections are consumed in reverse order of creation.
+                x = torch.cat([x, hidden_states.pop()], dim=1)
+            x = up_sample(x, time_embed, context, context_mask)
+        x = self.out_layer(x)
+        return x
+def get_control_unet(conf):
+    unet = ControlUNet(**conf)
+    return unet
+def get_unet(conf):
+    unet = UNet(**conf)
+    return unet

kandinsky3/model/__init__.py ADDED Viewed

File without changes

kandinsky3/model/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (212 Bytes). View file

kandinsky3/model/__pycache__/diffusion.cpython-310.pyc ADDED Viewed

Binary file (6.4 kB). View file

kandinsky3/model/__pycache__/nn.cpython-310.pyc ADDED Viewed

Binary file (3.53 kB). View file

kandinsky3/model/__pycache__/unet.cpython-310.pyc ADDED Viewed

Binary file (13.9 kB). View file

kandinsky3/model/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (1.85 kB). View file

kandinsky3/model/diffusion.py ADDED Viewed

	@@ -0,0 +1,200 @@

+import math
+import torch
+from einops import rearrange
+from tqdm import tqdm
+from .utils import get_tensor_items
+def get_named_beta_schedule(schedule_name, timesteps):
+    if schedule_name == "linear":
+        scale = 1000 / timesteps
+        beta_start = scale * 0.0001
+        beta_end = scale * 0.02
+        return torch.linspace(
+            beta_start, beta_end, timesteps, dtype=torch.float32
+        )
+    elif schedule_name == "cosine":
+        alpha_bar = lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+        betas = []
+        for i in range(timesteps):
+            t1 = i / timesteps
+            t2 = (i + 1) / timesteps
+            betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), 0.999))
+        return torch.tensor(betas, dtype=torch.float32)
+class BaseDiffusion:
+    """Gaussian diffusion helper: precomputes schedule tables and runs guided sampling.
+    The constructor derives cumulative-alpha tables from ``betas``. The ``q_*`` methods
+    cover the forward (noising) direction; the ``p_*`` methods cover the reverse
+    direction, using a doubled conditional/unconditional batch for text guidance.
+    NOTE(review): ``get_tensor_items`` is imported from ``.utils`` and not visible here;
+    it presumably gathers entries of a 1-D table at indices ``t`` and reshapes them to
+    broadcast against the given shape -- confirm there.
+    """
+    def __init__(self, betas, percentile=None, gen_noise=torch.randn_like):
+        """Precompute all schedule-dependent coefficient tables.
+        Args:
+            betas: 1-D tensor of per-step noise variances.
+            percentile: optional quantile used by ``process_x_start``; None selects
+                plain clipping to [-1, 1].
+            gen_noise: callable producing noise shaped like its argument (used by
+                ``q_sample``).
+        """
+        self.betas = betas
+        self.num_timesteps = betas.shape[0]
+        alphas = 1. - betas
+        self.alphas_cumprod = torch.cumprod(alphas, dim=0)
+        # Shifted by one step, with 1.0 as the value before the first step.
+        self.alphas_cumprod_prev = torch.cat([torch.ones(1, dtype=betas.dtype), self.alphas_cumprod[:-1]])
+        # calculate q(x_t | x_{t-1})
+        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
+        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1. - self.alphas_cumprod)
+        # calculate q(x_{t-1} | x_t, x_0)
+        self.posterior_mean_coef_1 = torch.sqrt(self.alphas_cumprod_prev) * betas / (1. - self.alphas_cumprod)
+        self.posterior_mean_coef_2 = torch.sqrt(alphas) * (1. - self.alphas_cumprod_prev) / (1. - self.alphas_cumprod)
+        self.posterior_variance = betas * (1. - self.alphas_cumprod_prev) / (1. - self.alphas_cumprod)
+        # posterior_variance[0] is 0 (alphas_cumprod_prev[0] == 1), so entry 1 is
+        # substituted at index 0 to keep the logarithm finite.
+        self.posterior_log_variance = torch.log(
+            torch.cat([self.posterior_variance[1].unsqueeze(0), self.posterior_variance[1:]])
+        )
+        self.percentile = percentile
+        # Integer factor mapping this schedule's step indices onto a 1000-step scale.
+        self.time_scale = 1000 // self.num_timesteps
+        self.gen_noise = gen_noise
+        # Not read by any method of this class as shown here.
+        self.jump_length = 3
+    def process_x_start(self, x_start):
+        """Clamp a predicted clean sample.
+        Without ``percentile`` the values are clipped to [-1, 1]. With it, each sample
+        is clipped to its own ``percentile`` quantile of absolute values (never below
+        1) and divided by that quantile.
+        """
+        bs, ndims = x_start.shape[0], len(x_start.shape[1:])
+        if self.percentile is not None:
+            quantile = torch.quantile(
+                rearrange(x_start, 'b ... -> b (...)').abs(),
+                self.percentile,
+                dim=-1
+            )
+            quantile = torch.clip(quantile, min=1.)
+            # Reshape to (bs, 1, ..., 1) so it broadcasts over the sample dimensions.
+            quantile = quantile.reshape(bs, *((1,) * ndims))
+            return torch.clip(x_start, -quantile, quantile) / quantile
+        else:
+            return torch.clip(x_start, -1., 1.)
+    def get_x_start(self, x, t, noise):
+        """Recover the clean-sample estimate from noisy ``x`` and predicted ``noise`` at step ``t``."""
+        sqrt_one_minus_alphas_cumprod = get_tensor_items(self.sqrt_one_minus_alphas_cumprod, t, noise.shape)
+        sqrt_alphas_cumprod = get_tensor_items(self.sqrt_alphas_cumprod, t, noise.shape)
+        pred_x_start = (x - sqrt_one_minus_alphas_cumprod * noise) / sqrt_alphas_cumprod
+        return pred_x_start
+    def get_noise(self, x, t, x_start):
+        """Recover the noise implied by noisy ``x`` and clean estimate ``x_start`` at step ``t``."""
+        sqrt_one_minus_alphas_cumprod = get_tensor_items(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)
+        sqrt_alphas_cumprod = get_tensor_items(self.sqrt_alphas_cumprod, t, x_start.shape)
+        pred_noise = (x - sqrt_alphas_cumprod * x_start) / sqrt_one_minus_alphas_cumprod
+        return pred_noise
+    def q_sample(self, x_start, t, noise=None):
+        """Noise ``x_start`` to step ``t``; draws noise with ``self.gen_noise`` when none is given."""
+        if noise is None:
+            noise = self.gen_noise(x_start)
+        sqrt_alphas_cumprod = get_tensor_items(self.sqrt_alphas_cumprod, t, x_start.shape)
+        sqrt_one_minus_alphas_cumprod = get_tensor_items(self.sqrt_one_minus_alphas_cumprod, t, noise.shape)
+        x_t = sqrt_alphas_cumprod * x_start + sqrt_one_minus_alphas_cumprod * noise
+        return x_t
+    def q_posterior_mean_variance(self, x_start, x_t, t):
+        """Return mean, variance and log-variance of q(x_{t-1} | x_t, x_0) from the precomputed tables."""
+        posterior_mean_coef_1 = get_tensor_items(self.posterior_mean_coef_1, t, x_start.shape)
+        posterior_mean_coef_2 = get_tensor_items(self.posterior_mean_coef_2, t, x_t.shape)
+        posterior_mean = posterior_mean_coef_1 * x_start + posterior_mean_coef_2 * x_t
+        posterior_variance = get_tensor_items(self.posterior_variance, t, x_start.shape)
+        posterior_log_variance = get_tensor_items(self.posterior_log_variance, t, x_start.shape)
+        return posterior_mean, posterior_variance, posterior_log_variance
+    def q_posterior_variance(self, t, prev_t, shape, eta=1., ):
+        """Return the noise scale for a jump from step ``t`` to step ``prev_t``.
+        Despite the name, the result is a square root: callers square it
+        (``p_mean_variance``) and multiply noise by it directly (``p_sample``).
+        ``eta`` scales the quantity under the square root.
+        """
+        alphas_cumprod = get_tensor_items(self.alphas_cumprod, t, shape)
+        prev_alphas_cumprod = get_tensor_items(self.alphas_cumprod, prev_t, shape)
+        posterior_variance = torch.sqrt(
+            eta * (1. - alphas_cumprod / prev_alphas_cumprod) * (1. - prev_alphas_cumprod) / (1. - alphas_cumprod)
+        )
+        return posterior_variance
+    def text_guidance(
+            self, model, x, t, context, context_mask, null_embedding, guidance_weight_text,
+            uncondition_context=None, uncondition_context_mask=None, mask=None, masked_latent=None
+    ):
+        """Run the model on a doubled batch and combine the two noise predictions.
+        The first half of the batch uses ``context``; the second half uses
+        ``uncondition_context`` or, when that is None, a zero context whose first
+        position holds ``null_embedding``. The result is
+        ``(w + 1) * conditional - w * unconditional`` with ``w = guidance_weight_text``.
+        """
+        # Duplicate inputs so both branches run in a single forward pass.
+        large_x = x.repeat(2, 1, 1, 1)
+        large_t = t.repeat(2).to(x.dtype)
+        if uncondition_context is None:
+            uncondition_context = torch.zeros_like(context)
+            uncondition_context_mask = torch.zeros_like(context_mask)
+            # Only the first position is populated and unmasked.
+            uncondition_context[:, 0] = null_embedding
+            uncondition_context_mask[:, 0] = 1
+        large_context = torch.cat([context, uncondition_context])
+        large_context_mask = torch.cat([context_mask, uncondition_context_mask])
+        if mask is not None:
+            mask = mask.repeat(2, 1, 1, 1)
+        if masked_latent is not None:
+            masked_latent = masked_latent.repeat(2, 1, 1, 1)
+        # A 9-channel input layer gets mask and masked latent stacked on the channel
+        # axis. NOTE(review): both must be non-None in that case or torch.cat fails.
+        if model.in_layer.in_channels == 9:
+            large_x = torch.cat([large_x, mask, masked_latent], dim=1)
+        pred_large_noise = model(large_x, large_t * self.time_scale, large_context, large_context_mask.bool())
+        pred_noise, uncond_pred_noise = torch.chunk(pred_large_noise, 2)
+        pred_noise = (guidance_weight_text + 1.) * pred_noise - guidance_weight_text * uncond_pred_noise
+        return pred_noise
+    def p_mean_variance(
+            self, model, x, t, prev_t, context, context_mask, null_embedding, guidance_weight_text, eta=1.,
+            negative_context=None, negative_context_mask=None, mask=None, masked_latent=None
+    ):
+        """Compute the mean and noise scale of the reverse step from ``t`` to ``prev_t``.
+        The guided noise prediction is converted to a clean-sample estimate, clamped by
+        ``process_x_start``, and the noise is re-derived from the clamped estimate
+        before the mean is assembled.
+        """
+        pred_noise = self.text_guidance(
+            model, x, t, context, context_mask, null_embedding, guidance_weight_text,
+            negative_context, negative_context_mask, mask, masked_latent
+        )
+        pred_x_start = self.get_x_start(x, t, pred_noise)
+        pred_x_start = self.process_x_start(pred_x_start)
+        # Keep the noise consistent with the clamped clean-sample estimate.
+        pred_noise = self.get_noise(x, t, pred_x_start)
+        pred_var = self.q_posterior_variance(t, prev_t, x.shape, eta)
+        prev_alphas_cumprod = get_tensor_items(self.alphas_cumprod, prev_t, x.shape)
+        pred_mean = torch.sqrt(prev_alphas_cumprod) * pred_x_start
+        # pred_var is already a square root, hence the explicit squaring here.
+        pred_mean += torch.sqrt(1. - prev_alphas_cumprod - pred_var ** 2) * pred_noise
+        return pred_mean, pred_var
+    # @torch.no_grad()
+    def p_sample(
+            self, model, x, t, prev_t, context, context_mask, null_embedding, guidance_weight_text, eta=1.,
+            negative_context=None, negative_context_mask=None, mask=None, masked_latent=None
+    ):
+        """Draw one reverse-process sample for the step from ``t`` to ``prev_t``.
+        Fresh noise is added only for batch entries whose ``prev_t`` is non-zero.
+        Noise here comes from ``torch.randn_like``, not from ``self.gen_noise``.
+        """
+        bs = x.shape[0]
+        ndims = len(x.shape[1:])
+        pred_mean, pred_var = self.p_mean_variance(
+            model, x, t, prev_t, context, context_mask, null_embedding, guidance_weight_text, eta,
+            negative_context=negative_context, negative_context_mask=negative_context_mask,
+            mask=mask, masked_latent=masked_latent
+        )
+        noise = torch.randn_like(x)
+        # From here on ``mask`` is rebound to the per-sample "not the final step" flag,
+        # shadowing the ``mask`` argument (which has already been consumed above).
+        mask = (prev_t != 0).reshape(bs, *((1,) * ndims))
+        sample = pred_mean + mask * pred_var * noise
+        return sample
+    # @torch.no_grad()
+    def p_sample_loop(
+            self, model, shape, times, device, context, context_mask, null_embedding, guidance_weight_text, eta=1.,
+            negative_context=None, negative_context_mask=None, mask=None, masked_latent=None, gan=False,
+    ):
+        """Generate a sample of ``shape`` by iterating over ``times``.
+        ``times`` is extended with a trailing 0 by list concatenation and walked as
+        consecutive ``(time, prev_time)`` pairs. With ``gan=True`` each step re-noises
+        the current image to ``time``, calls the model directly (no guidance, no
+        ``time_scale`` factor) and keeps the resulting clean-sample estimate;
+        otherwise ``p_sample`` is used.
+        NOTE(review): the ``+ [0, ]`` concatenation means ``times`` should be a list.
+        """
+        img = torch.randn(*shape, device=device)
+        times = times + [0, ]
+        times = list(zip(times[:-1], times[1:]))
+        for time, prev_time in tqdm(times):
+            # Broadcast the scalar step index to one entry per batch element.
+            time = torch.tensor([time] * shape[0], device=device)
+            if gan:
+                x_t = self.q_sample(img, time)
+                pred_noise = model(x_t, time.type(x_t.dtype), context, context_mask.bool())
+                img = self.get_x_start(x_t, time, pred_noise)
+            else:
+                prev_time = torch.tensor([prev_time] * shape[0], device=device)
+                img = self.p_sample(
+                    model, img, time, prev_time, context, context_mask, null_embedding, guidance_weight_text, eta,
+                    negative_context=negative_context, negative_context_mask=negative_context_mask,
+                    mask=mask, masked_latent=masked_latent
+                )
+        return img
+def get_diffusion(conf):
+    betas = get_named_beta_schedule(**conf.schedule_params)
+    base_diffusion = BaseDiffusion(betas, **conf.diffusion_params)
+    return base_diffusion

kandinsky3/model/nn.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import math
+import torch
+from torch import nn, einsum
+from einops import rearrange, repeat
+from .utils import exist
+class Identity(nn.Module):
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+    @staticmethod
+    def forward(x, *args, **kwargs):
+        return x
+class SinusoidalPosEmb(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+    def forward(self, x):
+        half_dim = self.dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, device=x.device, dtype=x.dtype) * -emb)
+        emb = rearrange(x, 'i -> i 1') * rearrange(emb, 'j -> 1 j')
+        return torch.cat((emb.sin(), emb.cos()), dim=-1)
+class ConditionalGroupNorm(nn.Module):
+    def __init__(self, groups, normalized_shape, context_dim):
+        super().__init__()
+        self.norm = nn.GroupNorm(groups, normalized_shape, affine=False)
+        self.context_mlp = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(context_dim, 2 * normalized_shape)
+        )
+        self.context_mlp[1].weight.data.zero_()
+        self.context_mlp[1].bias.data.zero_()
+    def forward(self, x, context):
+        context = self.context_mlp(context)
+        ndims = ' 1' * len(x.shape[2:])
+        context = rearrange(context, f'b c -> b c{ndims}')
+        scale, shift = context.chunk(2, dim=1)
+        x = self.norm(x) * (scale + 1.) + shift
+        return x
+class Attention(nn.Module):
+    def __init__(self, in_channels, out_channels, context_dim, head_dim=64):
+        super().__init__()
+        assert out_channels % head_dim == 0
+        self.num_heads = out_channels // head_dim
+        self.scale = head_dim ** -0.5
+        self.to_query = nn.Linear(in_channels, out_channels, bias=False)
+        self.to_key = nn.Linear(context_dim, out_channels, bias=False)
+        self.to_value = nn.Linear(context_dim, out_channels, bias=False)
+        self.output_layer = nn.Linear(out_channels, out_channels, bias=False)
+    def forward(self, x, context, context_mask=None):
+        query = rearrange(self.to_query(x), 'b n (h d) -> b h n d', h=self.num_heads)
+        key = rearrange(self.to_key(context), 'b n (h d) -> b h n d', h=self.num_heads)
+        value = rearrange(self.to_value(context), 'b n (h d) -> b h n d', h=self.num_heads)
+        attention_matrix = einsum('b h i d, b h j d -> b h i j', query, key) * self.scale
+        if exist(context_mask):
+            max_neg_value = -torch.finfo(attention_matrix.dtype).max
+            context_mask = rearrange(context_mask, 'b j -> b 1 1 j')
+            attention_matrix = attention_matrix.masked_fill(~context_mask, max_neg_value)
+        attention_matrix = attention_matrix.softmax(dim=-1)
+        out = einsum('b h i j, b h j d -> b h i d', attention_matrix, value)
+        out = rearrange(out, 'b h n d -> b n (h d)')
+        out = self.output_layer(out)
+        return out

kandinsky3/model/unet.py ADDED Viewed

	@@ -0,0 +1,516 @@

+import torch
+from torch import nn, einsum
+from einops import rearrange
+from .nn import Identity, Attention, SinusoidalPosEmb, ConditionalGroupNorm
+from .utils import exist, set_default_item, set_default_layer
+import torch.nn.functional as F
+class Block(nn.Module):
+    def __init__(self, in_channels, out_channels, time_embed_dim, kernel_size=3, norm_groups=32, up_resolution=None):
+        super().__init__()
+        self.group_norm = ConditionalGroupNorm(norm_groups, in_channels, time_embed_dim)
+        self.activation = nn.SiLU()
+        self.up_sample = set_default_layer(
+            exist(up_resolution) and up_resolution,
+            nn.ConvTranspose2d, (in_channels, in_channels), {'kernel_size': 2, 'stride': 2}
+        )
+        padding = set_default_item(kernel_size == 1, 0, 1)
+        self.projection = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=padding)
+        self.down_sample = set_default_layer(
+            exist(up_resolution) and not up_resolution,
+            nn.Conv2d, (out_channels, out_channels), {'kernel_size': 2, 'stride': 2}
+        )
+    def forward(self, x, time_embed):
+        x = self.group_norm(x, time_embed)
+        x = self.activation(x)
+        x = self.up_sample(x)
+        x = self.projection(x)
+        x = self.down_sample(x)
+        return x
+class ResNetBlock(nn.Module):
+    def __init__(
+            self, in_channels, out_channels, time_embed_dim, norm_groups=32, compression_ratio=2, up_resolutions=4*[None]
+    ):
+        super().__init__()
+        kernel_sizes = [1, 3, 3, 1]
+        hidden_channel = max(in_channels, out_channels) // compression_ratio
+        hidden_channels = [(in_channels, hidden_channel)] + [(hidden_channel, hidden_channel)] * 2 + [(hidden_channel, out_channels)]
+        self.resnet_blocks = nn.ModuleList([
+            Block(in_channel, out_channel, time_embed_dim, kernel_size, norm_groups, up_resolution)
+            for (in_channel, out_channel), kernel_size, up_resolution in zip(hidden_channels, kernel_sizes, up_resolutions)
+        ])
+        self.shortcut_up_sample = set_default_layer(
+            True in up_resolutions,
+            nn.ConvTranspose2d, (in_channels, in_channels), {'kernel_size': 2, 'stride': 2}
+        )
+        self.shortcut_projection = set_default_layer(
+            in_channels != out_channels,
+            nn.Conv2d, (in_channels, out_channels), {'kernel_size': 1}
+        )
+        self.shortcut_down_sample = set_default_layer(
+            False in up_resolutions,
+            nn.Conv2d, (out_channels, out_channels), {'kernel_size': 2, 'stride': 2}
+        )
+    def forward(self, x, time_embed):
+        out = x
+        for resnet_block in self.resnet_blocks:
+            out = resnet_block(out, time_embed)
+        x = self.shortcut_up_sample(x)
+        x = self.shortcut_projection(x)
+        x = self.shortcut_down_sample(x)
+        x = x + out
+        return x
+class AttentionPolling(nn.Module):
+    def __init__(self, num_channels, context_dim, head_dim=64):
+        super().__init__()
+        self.attention = Attention(context_dim, num_channels, context_dim, head_dim)
+    def forward(self, x, context, context_mask=None):
+        context = self.attention(context.mean(dim=1, keepdim=True), context, context_mask)
+        return x + context.squeeze(1)
+class AttentionBlock(nn.Module):
+    def __init__(self, num_channels, time_embed_dim, context_dim=None, norm_groups=32, head_dim=64, expansion_ratio=4):
+        super().__init__()
+        self.in_norm = ConditionalGroupNorm(norm_groups, num_channels, time_embed_dim)
+        self.attention = Attention(num_channels, num_channels, context_dim or num_channels, head_dim)
+        hidden_channels = expansion_ratio * num_channels
+        self.out_norm = ConditionalGroupNorm(norm_groups, num_channels, time_embed_dim)
+        self.feed_forward = nn.Sequential(
+            nn.Conv2d(num_channels, hidden_channels, kernel_size=1, bias=False),
+            nn.SiLU(),
+            nn.Conv2d(hidden_channels, num_channels, kernel_size=1, bias=False),
+        )
+    def forward(self, x, time_embed, context=None, context_mask=None):
+        height, width = x.shape[-2:]
+        out = self.in_norm(x, time_embed)
+        out = rearrange(out, 'b c h w -> b (h w) c', h=height, w=width)
+        context = set_default_item(exist(context), context, out)
+        out = self.attention(out, context, context_mask)
+        out = rearrange(out, 'b (h w) c -> b c h w', h=height, w=width)
+        x = x + out
+        out = self.out_norm(x, time_embed)
+        out = self.feed_forward(out)
+        x = x + out
+        return x
+class DownSampleBlock(nn.Module):
+    """Encoder stage: optional self-attention, then ``num_blocks`` (ResNet, attention, ResNet) triples.
+    Only the second ResNet block of the last triple receives a down-sampling request,
+    and only when ``down_sample`` is set.
+    NOTE(review): ``set_default_layer`` and ``set_default_item`` come from ``.utils``;
+    they presumably pick the first option when the condition holds and the fallback
+    (``layer_2`` or None) otherwise -- confirm there.
+    """
+    def __init__(
+            self, in_channels, out_channels, time_embed_dim, context_dim=None,
+            num_blocks=3, groups=32, head_dim=64, expansion_ratio=4, compression_ratio=2,
+            down_sample=True, self_attention=True
+    ):
+        super().__init__()
+        # Self-attention runs first, on the stage's input width.
+        self.self_attention_block = set_default_layer(
+            self_attention,
+            AttentionBlock,
+            (in_channels, time_embed_dim, None, groups, head_dim, expansion_ratio),
+            layer_2=Identity
+        )
+        # One 4-entry resampling spec per triple; only the last triple may request
+        # down-sampling, in its third inner stage.
+        up_resolutions = [[None] * 4] * (num_blocks - 1) + [[None, None, set_default_item(down_sample, False), None]]
+        # The first triple changes the channel count; the rest keep it.
+        hidden_channels = [(in_channels, out_channels)] + [(out_channels, out_channels)] * (num_blocks - 1)
+        self.resnet_attn_blocks = nn.ModuleList([
+            nn.ModuleList([
+                ResNetBlock(in_channel, out_channel, time_embed_dim, groups, compression_ratio),
+                set_default_layer(
+                    exist(context_dim),
+                    AttentionBlock,
+                    (out_channel, time_embed_dim, context_dim, groups, head_dim, expansion_ratio),
+                    layer_2=Identity
+                ),
+                ResNetBlock(out_channel, out_channel, time_embed_dim, groups, compression_ratio, up_resolution),
+            ]) for (in_channel, out_channel), up_resolution in zip(hidden_channels, up_resolutions)
+        ])
+    def forward(self, x, time_embed, context=None, context_mask=None, control_net_residual=None):
+        """Apply self-attention, then every (ResNet, attention, ResNet) triple in order.
+        ``control_net_residual`` is accepted but not used in this method.
+        """
+        x = self.self_attention_block(x, time_embed)
+        for in_resnet_block, attention, out_resnet_block in self.resnet_attn_blocks:
+            x = in_resnet_block(x, time_embed)
+            x = attention(x, time_embed, context, context_mask)
+            x = out_resnet_block(x, time_embed)
+        return x
+class UpSampleBlock(nn.Module):
+    """Decoder stage: ``num_blocks`` (ResNet, attention, ResNet) triples, then optional self-attention.
+    The first triple takes ``in_channels + cat_dim`` channels (the input concatenated
+    with a skip connection) and its first ResNet block carries the up-sampling request
+    when ``up_sample`` is set.
+    NOTE(review): with ``num_blocks == 1`` the channel list has two entries but the
+    resampling list has one, so ``zip`` keeps only the first pair and the projection
+    to ``out_channels`` is dropped -- confirm that a single block is never configured.
+    """
+    def __init__(
+            self, in_channels, cat_dim, out_channels, time_embed_dim, context_dim=None,
+            num_blocks=3, groups=32, head_dim=64, expansion_ratio=4, compression_ratio=2,
+            up_sample=True, self_attention=True
+    ):
+        super().__init__()
+        # Only the first triple may request up-sampling, in its second inner stage.
+        up_resolutions = [[None, set_default_item(up_sample, True), None, None]] + [[None] * 4] * (num_blocks - 1)
+        # First triple absorbs the skip connection; the last one maps to out_channels.
+        hidden_channels = [(in_channels + cat_dim, in_channels)] + [(in_channels, in_channels)] * (num_blocks - 2) + [(in_channels, out_channels)]
+        self.resnet_attn_blocks = nn.ModuleList([
+            nn.ModuleList([
+                ResNetBlock(in_channel, in_channel, time_embed_dim, groups, compression_ratio, up_resolution),
+                set_default_layer(
+                    exist(context_dim),
+                    AttentionBlock,
+                    (in_channel, time_embed_dim, context_dim, groups, head_dim, expansion_ratio),
+                    layer_2=Identity
+                ),
+                ResNetBlock(in_channel, out_channel, time_embed_dim, groups, compression_ratio),
+            ]) for (in_channel, out_channel), up_resolution in zip(hidden_channels, up_resolutions)
+        ])
+        # Self-attention runs last, on the stage's output width.
+        self.self_attention_block = set_default_layer(
+            self_attention,
+            AttentionBlock,
+            (out_channels, time_embed_dim, None, groups, head_dim, expansion_ratio),
+            layer_2=Identity
+        )
+    def forward(self, x, time_embed, context=None, context_mask=None):
+        """Apply every (ResNet, attention, ResNet) triple in order, then self-attention."""
+        for in_resnet_block, attention, out_resnet_block in self.resnet_attn_blocks:
+            x = in_resnet_block(x, time_embed)
+            x = attention(x, time_embed, context, context_mask)
+            x = out_resnet_block(x, time_embed)
+        x = self.self_attention_block(x, time_embed)
+        return x
+class ControlNetModel(nn.Module):
+    def __init__(self,
+                 model_channels,
+                 init_channels=None,
+                 num_channels=3,
+                 out_channels=4,
+                 time_embed_dim=None,
+                 context_dim=None,
+                 groups=32,
+                 head_dim=64,
+                 expansion_ratio=4,
+                 compression_ratio=2,
+                 dim_mult=(1, 2, 4, 8),
+                 num_blocks=(3, 3, 3, 3),
+                 add_cross_attention=(False, True, True, True),
+                 add_self_attention=(False, True, True, True)
+                 ):
+        super().__init__()
+        init_channels = init_channels or model_channels
+        self.to_time_embed = nn.Sequential(
+            SinusoidalPosEmb(init_channels),
+            nn.Linear(init_channels, time_embed_dim),
+            nn.SiLU(),
+            nn.Linear(time_embed_dim, time_embed_dim)
+        )
+        self.feature_pooling = AttentionPolling(time_embed_dim, context_dim, head_dim)
+        self.in_layer = nn.Conv2d(num_channels, init_channels, kernel_size=3, padding=1)
+        hidden_dims = [init_channels, *map(lambda mult: model_channels * mult, dim_mult)]
+        in_out_dims = list(zip(hidden_dims[:-1], hidden_dims[1:]))
+        text_dims = [set_default_item(is_exist, context_dim) for is_exist in add_cross_attention]
+        layer_params = [num_blocks, text_dims, add_self_attention]
+        rev_layer_params = map(reversed, layer_params)
+        cat_dims = []
+        self.num_levels = len(in_out_dims)
+        self.down_samples = nn.ModuleList([])
+        for level, ((in_dim, out_dim), res_block_num, text_dim, self_attention) in enumerate(zip(in_out_dims, *layer_params)):
+            down_sample = level != (self.num_levels - 1)
+            cat_dims.append(set_default_item(level != (self.num_levels - 1), out_dim, 0))
+            self.down_samples.append(
+                DownSampleBlock(
+                    in_dim, out_dim, time_embed_dim, text_dim, res_block_num, groups, head_dim, expansion_ratio,
+                    compression_ratio, down_sample, self_attention
+                )
+            )
+    def forward(self, x, time, context=None, context_mask=None):
+        time_embed = self.to_time_embed(time)
+        if exist(context):
+            time_embed = self.feature_pooling(time_embed, context, context_mask)
+        hidden_states = []
+        x = self.in_layer(x)
+        for level, down_sample in enumerate(self.down_samples):
+            x = down_sample(x, time_embed, context, context_mask)
+            if level != self.num_levels - 1:
+                hidden_states.append(x)
+        return hidden_states
+class UNet(nn.Module):
+    """Time- and context-conditioned U-Net with skip connections between mirrored levels.
+    Extra positional and keyword arguments are accepted and ignored.
+    NOTE(review): ``time_embed_dim`` defaults to None but is passed straight to
+    ``nn.Linear``, so callers presumably always supply it -- confirm against the configs.
+    """
+    def __init__(self,
+                 model_channels,
+                 init_channels=None,
+                 num_channels=3,
+                 out_channels=4,
+                 time_embed_dim=None,
+                 context_dim=None,
+                 groups=32,
+                 head_dim=64,
+                 expansion_ratio=4,
+                 compression_ratio=2,
+                 dim_mult=(1, 2, 4, 8),
+                 num_blocks=(3, 3, 3, 3),
+                 add_cross_attention=(False, True, True, True),
+                 add_self_attention=(False, True, True, True),
+                 *args,
+                 **kwargs,
+                 ):
+        super().__init__()
+        # Fall back to model_channels when no explicit stem width is given.
+        init_channels = init_channels or model_channels
+        self.to_time_embed = nn.Sequential(
+            SinusoidalPosEmb(init_channels),
+            nn.Linear(init_channels, time_embed_dim),
+            nn.SiLU(),
+            nn.Linear(time_embed_dim, time_embed_dim)
+        )
+        self.feature_pooling = AttentionPolling(time_embed_dim, context_dim, head_dim)
+        self.in_layer = nn.Conv2d(num_channels, init_channels, kernel_size=3, padding=1)
+        # Channel width per level: the stem width followed by model_channels * multiplier.
+        hidden_dims = [init_channels, *map(lambda mult: model_channels * mult, dim_mult)]
+        in_out_dims = list(zip(hidden_dims[:-1], hidden_dims[1:]))
+        text_dims = [set_default_item(is_exist, context_dim) for is_exist in add_cross_attention]
+        layer_params = [num_blocks, text_dims, add_self_attention]
+        # Lazy iterator of reversed per-level settings, consumed once by the up path below.
+        rev_layer_params = map(reversed, layer_params)
+        # Skip-connection widths recorded on the way down and popped on the way up.
+        cat_dims = []
+        self.num_levels = len(in_out_dims)
+        self.down_samples = nn.ModuleList([])
+        for level, ((in_dim, out_dim), res_block_num, text_dim, self_attention) in enumerate(zip(in_out_dims, *layer_params)):
+            # Every level except the deepest one reduces resolution.
+            down_sample = level != (self.num_levels - 1)
+            cat_dims.append(set_default_item(level != (self.num_levels - 1), out_dim, 0))
+            self.down_samples.append(
+                DownSampleBlock(
+                    in_dim, out_dim, time_embed_dim, text_dim, res_block_num, groups, head_dim, expansion_ratio,
+                    compression_ratio, down_sample, self_attention
+                )
+            )
+        self.up_samples = nn.ModuleList([])
+        for level, ((out_dim, in_dim), res_block_num, text_dim, self_attention) in enumerate(zip(reversed(in_out_dims), *rev_layer_params)):
+            # The first up block (deepest level) keeps resolution; the others up-sample.
+            up_sample = level != 0
+            self.up_samples.append(
+                UpSampleBlock(
+                    in_dim, cat_dims.pop(), out_dim, time_embed_dim, text_dim, res_block_num, groups, head_dim,
+                    expansion_ratio, compression_ratio, up_sample, self_attention
+                )
+            )
+        self.out_layer = nn.Sequential(
+            nn.GroupNorm(groups, init_channels),
+            nn.SiLU(),
+            nn.Conv2d(init_channels, out_channels, kernel_size=3, padding=1)
+        )
+        # Placeholder attribute; this class never assigns a control net itself.
+        self.control_net = None
+    def forward(self, x, time, context=None, context_mask=None, control_net_residual=None):
+        """Predict the output map for ``x`` at ``time``.
+        ``control_net_residual`` is handed to every down-sample block unchanged;
+        ``DownSampleBlock.forward`` in this file accepts it without using it.
+        """
+        time_embed = self.to_time_embed(time)
+        if exist(context):
+            time_embed = self.feature_pooling(time_embed, context, context_mask)
+        hidden_states = []
+        x = self.in_layer(x)
+        for level, down_sample in enumerate(self.down_samples):
+            x = down_sample(x, time_embed, context, context_mask, control_net_residual)
+            if level != self.num_levels - 1:
+                # The deepest level's output is not stored as a skip connection.
+                hidden_states.append(x)
+        for level, up_sample in enumerate(self.up_samples):
+            if level != 0:
+                # Skip connections are consumed in reverse order of creation.
+                x = torch.cat([x, hidden_states.pop()], dim=1)
+            x = up_sample(x, time_embed, context, context_mask)
+        x = self.out_layer(x)
+        return x
+class ControlNetModel(nn.Module):
+    """Encoder-only network that yields per-level feature maps.
+    Only the time embedding, the context pooling, the input convolution and the
+    down-sampling blocks are built. ``forward`` returns the list of outputs of every
+    level except the last one.
+    """
+    def __init__(self,
+                 model_channels,
+                 init_channels=None,
+                 num_channels=3,
+                 out_channels=4,
+                 time_embed_dim=None,
+                 context_dim=None,
+                 groups=32,
+                 head_dim=64,
+                 expansion_ratio=4,
+                 compression_ratio=2,
+                 dim_mult=(1, 2, 4, 8),
+                 num_blocks=(3, 3, 3, 3),
+                 add_cross_attention=(False, True, True, True),
+                 add_self_attention=(False, True, True, True),
+                *args,
+                 **kwargs,
+                 ):
+        # NOTE: ``out_channels``, ``*args`` and ``**kwargs`` are accepted but not used
+        # anywhere in this class.
+        super().__init__()
+        # Fall back to ``model_channels`` when ``init_channels`` is unset (or falsy).
+        init_channels = init_channels or model_channels
+        # Timestep -> SinusoidalPosEmb (defined elsewhere) -> Linear / SiLU / Linear.
+        self.to_time_embed = nn.Sequential(
+            SinusoidalPosEmb(init_channels),
+            nn.Linear(init_channels, time_embed_dim),
+            nn.SiLU(),
+            nn.Linear(time_embed_dim, time_embed_dim)
+        )
+        self.feature_pooling = AttentionPolling(time_embed_dim, context_dim, head_dim)
+        self.in_layer = nn.Conv2d(num_channels, init_channels, kernel_size=3, padding=1)
+        # Channel width entering the first level, then the width of each level.
+        hidden_dims = [init_channels, *map(lambda mult: model_channels * mult, dim_mult)]
+        # (in, out) channel pair for every level.
+        in_out_dims = list(zip(hidden_dims[:-1], hidden_dims[1:]))
+        # ``context_dim`` for levels with cross attention enabled, ``None`` otherwise.
+        text_dims = [set_default_item(is_exist, context_dim) for is_exist in add_cross_attention]
+        layer_params = [num_blocks, text_dims, add_self_attention]
+        # NOTE: ``rev_layer_params`` is never consumed and ``cat_dims`` is filled but
+        # never read in this class (there are no up-sampling blocks here).
+        rev_layer_params = map(reversed, layer_params)
+        cat_dims = []
+        self.num_levels = len(in_out_dims)
+        self.down_samples = nn.ModuleList([])
+        for level, ((in_dim, out_dim), res_block_num, text_dim, self_attention) in enumerate(zip(in_out_dims, *layer_params)):
+            # The flag handed to DownSampleBlock is True for every level but the last.
+            down_sample = level != (self.num_levels - 1)
+            cat_dims.append(set_default_item(level != (self.num_levels - 1), out_dim, 0))
+            self.down_samples.append(
+                DownSampleBlock(
+                    in_dim, out_dim, time_embed_dim, text_dim, res_block_num, groups, head_dim, expansion_ratio,
+                    compression_ratio, down_sample, self_attention
+                )
+            )
+    def forward(self, x, time, context=None, context_mask=None):
+        """Return the outputs of all down-sampling levels except the last, in level order."""
+        time_embed = self.to_time_embed(time)
+        # Pool the context into the time embedding only when a context is supplied.
+        if exist(context):
+            time_embed = self.feature_pooling(time_embed, context, context_mask)
+        hidden_states = []
+        x = self.in_layer(x)
+        for level, down_sample in enumerate(self.down_samples):
+            x = down_sample(x, time_embed, context, context_mask)
+            # The last level still runs, but its output is not collected.
+            if level != self.num_levels - 1:
+                hidden_states.append(x)
+        return hidden_states
+class ControlUNet(nn.Module):
+    """Encoder/decoder network whose encoder features are summed with ControlNetModel features.
+    A ``ControlNetModel`` is built with the same hyper-parameters but with
+    ``control_net_channels`` input channels. In ``forward`` its per-level outputs are
+    added to this network's own encoder outputs before they are stored as skips.
+    """
+    def __init__(self,
+                 model_channels,
+                 init_channels=None,
+                 num_channels=3,
+                 out_channels=4,
+                 time_embed_dim=None,
+                 context_dim=None,
+                 groups=32,
+                 head_dim=64,
+                 expansion_ratio=4,
+                 compression_ratio=2,
+                 dim_mult=(1, 2, 4, 8),
+                 num_blocks=(3, 3, 3, 3),
+                 add_cross_attention=(False, True, True, True),
+                 add_self_attention=(False, True, True, True),
+                 control_net_channels=5,
+                 *args,
+                 **kwargs,
+                 ):
+        # NOTE: ``*args`` and ``**kwargs`` are accepted but not used in this class.
+        super().__init__()
+        # Fall back to ``model_channels`` when ``init_channels`` is unset (or falsy).
+        init_channels = init_channels or model_channels
+        # Timestep -> SinusoidalPosEmb (defined elsewhere) -> Linear / SiLU / Linear.
+        self.to_time_embed = nn.Sequential(
+            SinusoidalPosEmb(init_channels),
+            nn.Linear(init_channels, time_embed_dim),
+            nn.SiLU(),
+            nn.Linear(time_embed_dim, time_embed_dim)
+        )
+        self.feature_pooling = AttentionPolling(time_embed_dim, context_dim, head_dim)
+        self.in_layer = nn.Conv2d(num_channels, init_channels, kernel_size=3, padding=1)
+        # Channel width entering the first level, then the width of each level.
+        hidden_dims = [init_channels, *map(lambda mult: model_channels * mult, dim_mult)]
+        # (in, out) channel pair for every level.
+        in_out_dims = list(zip(hidden_dims[:-1], hidden_dims[1:]))
+        # ``context_dim`` for levels with cross attention enabled, ``None`` otherwise.
+        text_dims = [set_default_item(is_exist, context_dim) for is_exist in add_cross_attention]
+        layer_params = [num_blocks, text_dims, add_self_attention]
+        # Same per-level parameters in reverse order, consumed by the decoder loop below.
+        rev_layer_params = map(reversed, layer_params)
+        # Skip-connection channel counts, pushed per encoder level and popped per decoder level.
+        cat_dims = []
+        self.num_levels = len(in_out_dims)
+        self.down_samples = nn.ModuleList([])
+        for level, ((in_dim, out_dim), res_block_num, text_dim, self_attention) in enumerate(zip(in_out_dims, *layer_params)):
+            # The flag handed to DownSampleBlock is True for every level but the last.
+            down_sample = level != (self.num_levels - 1)
+            # The last level contributes no skip connection, hence 0 extra channels.
+            cat_dims.append(set_default_item(level != (self.num_levels - 1), out_dim, 0))
+            self.down_samples.append(
+                DownSampleBlock(
+                    in_dim, out_dim, time_embed_dim, text_dim, res_block_num, groups, head_dim, expansion_ratio,
+                    compression_ratio, down_sample, self_attention
+                )
+            )
+        self.up_samples = nn.ModuleList([])
+        # Walk the levels deepest-first; note the pair is unpacked as (out_dim, in_dim),
+        # i.e. the encoder's output width becomes the decoder block's input width.
+        for level, ((out_dim, in_dim), res_block_num, text_dim, self_attention) in enumerate(zip(reversed(in_out_dims), *rev_layer_params)):
+            # The flag handed to UpSampleBlock is False only for the first decoder level.
+            up_sample = level != 0
+            self.up_samples.append(
+                UpSampleBlock(
+                    in_dim, cat_dims.pop(), out_dim, time_embed_dim, text_dim, res_block_num, groups, head_dim,
+                    expansion_ratio, compression_ratio, up_sample, self_attention
+                )
+            )
+        self.out_layer = nn.Sequential(
+            nn.GroupNorm(groups, init_channels),
+            nn.SiLU(),
+            nn.Conv2d(init_channels, out_channels, kernel_size=3, padding=1)
+        )
+        # Positional arguments follow ControlNetModel's signature; the third one
+        # (``num_channels`` there) is ``control_net_channels``.
+        self.control_net = ControlNetModel(model_channels,
+                                            init_channels,
+                                            control_net_channels,
+                                            out_channels,
+                                            time_embed_dim,
+                                            context_dim,
+                                            groups,
+                                            head_dim,
+                                            expansion_ratio,
+                                            compression_ratio,
+                                            dim_mult,
+                                            num_blocks,
+                                            add_cross_attention,
+                                            add_self_attention)
+    def forward(self, x, time, context=None, context_mask=None, control_net_data=None):
+        """Run the network on ``x``, adding control-net features to the encoder outputs."""
+        time_embed = self.to_time_embed(time)
+        # Pool the context into the time embedding only when a context is supplied.
+        if exist(context):
+            time_embed = self.feature_pooling(time_embed, context, context_mask)
+        # The control net receives the raw ``time`` and computes its own embedding.
+        # NOTE(review): ``control_net_data`` defaults to None but is passed straight to
+        # the control net's input convolution - presumably it is required in practice.
+        control_net_hiddens =  self.control_net(control_net_data, time, context, context_mask)
+        hidden_states = []
+        x = self.in_layer(x)
+        for level, down_sample in enumerate(self.down_samples):
+            x = down_sample(x, time_embed, context, context_mask)
+            if level != self.num_levels - 1:
+                # Control features are consumed front-to-back, i.e. in level order.
+                # The addition is in place, so the stored skip already includes it.
+                x += control_net_hiddens.pop(0)
+                hidden_states.append(x)
+        for level, up_sample in enumerate(self.up_samples):
+            # Every decoder level but the first concatenates the latest stored skip.
+            if level != 0:
+                x = torch.cat([x, hidden_states.pop()], dim=1)
+            x = up_sample(x, time_embed, context, context_mask)
+        x = self.out_layer(x)
+        return x
+def get_control_unet(conf):
+    unet = ControlUNet(**conf)
+    return unet
+def get_unet(conf):
+    unet = UNet(**conf)
+    return unet

kandinsky3/model/utils.py ADDED Viewed

	@@ -0,0 +1,62 @@

+from torch.nn import Identity
+from einops import rearrange
+def exist(item):
+    return item is not None
+def set_default_item(condition, item_1, item_2=None):
+    if condition:
+        return item_1
+    else:
+        return item_2
+def set_default_layer(condition, layer_1, args_1=[], kwargs_1={}, layer_2=Identity, args_2=[], kwargs_2={}):
+    if condition:
+        return layer_1(*args_1, **kwargs_1)
+    else:
+        return layer_2(*args_2, **kwargs_2)
+def get_tensor_items(x, pos, broadcast_shape):
+    device = pos.device
+    bs = pos.shape[0]
+    ndims = len(broadcast_shape[1:])
+    x = x.cpu()[pos.cpu()]
+    return x.reshape(bs, *((1,) * ndims)).to(device)
+def local_patching(x, height, width, group_size):
+    if group_size > 0:
+        x = rearrange(
+            x, 'b c (h g1) (w g2) -> b (h w) (g1 g2) c',
+            h=height//group_size, w=width//group_size, g1=group_size, g2=group_size
+        )
+    else:
+        x = rearrange(x, 'b c h w -> b (h w) c', h=height, w=width)
+    return x
+def local_merge(x, height, width, group_size):
+    if group_size > 0:
+        x = rearrange(
+            x, 'b (h w) (g1 g2) c -> b c (h g1) (w g2)',
+            h=height//group_size, w=width//group_size, g1=group_size, g2=group_size
+        )
+    else:
+        x = rearrange(x, 'b (h w) c -> b c h w', h=height, w=width)
+    return x
+def global_patching(x, height, width, group_size):
+    x = local_patching(x, height, width, height//group_size)
+    x = x.transpose(-2, -3)
+    return x
+def global_merge(x, height, width, group_size):
+    x = x.transpose(-2, -3)
+    x = local_merge(x, height, width, height//group_size)
+    return x

kandinsky3/movq.py ADDED Viewed

	@@ -0,0 +1,431 @@

+import math
+import torch
+import torch.nn as nn
+import numpy as np
+import torch.nn.functional as F
+from .utils import freeze
+def nonlinearity(x):
+    return x*torch.sigmoid(x)
+class SpatialNorm(nn.Module):
+    def __init__(
+        self, f_channels, zq_channels=None, norm_layer=nn.GroupNorm, freeze_norm_layer=False, add_conv=False, **norm_layer_params
+    ):
+        super().__init__()
+        self.norm_layer = norm_layer(num_channels=f_channels, **norm_layer_params)
+        if zq_channels is not None:
+            if freeze_norm_layer:
+                for p in self.norm_layer.parameters:
+                    p.requires_grad = False
+            self.add_conv = add_conv
+            if self.add_conv:
+                self.conv = nn.Conv2d(zq_channels, zq_channels, kernel_size=3, stride=1, padding=1)
+            self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0)
+            self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0)
+    def forward(self, f, zq=None):
+        norm_f = self.norm_layer(f)
+        if zq is not None:
+            f_size = f.shape[-2:]
+            zq = torch.nn.functional.interpolate(zq, size=f_size, mode="nearest")
+            if self.add_conv:
+                zq = self.conv(zq)
+            norm_f = norm_f * self.conv_y(zq) + self.conv_b(zq)
+        return norm_f
+def Normalize(in_channels, zq_ch=None, add_conv=None):
+    return SpatialNorm(
+            in_channels, zq_ch, norm_layer=nn.GroupNorm,
+            freeze_norm_layer=False, add_conv=add_conv, num_groups=32, eps=1e-6, affine=True
+        )
+class Upsample(nn.Module):
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if self.with_conv:
+            self.conv = torch.nn.Conv2d(in_channels,
+                                        in_channels,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+    def forward(self, x):
+        x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+        if self.with_conv:
+            x = self.conv(x)
+        return x
+class Downsample(nn.Module):
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if self.with_conv:
+            self.conv = torch.nn.Conv2d(in_channels,
+                                        in_channels,
+                                        kernel_size=3,
+                                        stride=2,
+                                        padding=0)
+    def forward(self, x):
+        if self.with_conv:
+            pad = (0,1,0,1)
+            x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
+            x = self.conv(x)
+        else:
+            x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
+        return x
+class ResnetBlock(nn.Module):
+    """Residual block: two norm / nonlinearity / 3x3-conv stages plus a shortcut.
+    Both norms are built by ``Normalize`` and receive ``zq`` in ``forward``. When the
+    input and output channel counts differ, the shortcut is a 3x3 convolution
+    (``conv_shortcut=True``) or a 1x1 convolution (default).
+    """
+    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
+                 dropout, temb_channels=512, zq_ch=None, add_conv=False):
+        super().__init__()
+        self.in_channels = in_channels
+        # Output width defaults to the input width.
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.norm1 = Normalize(in_channels, zq_ch, add_conv=add_conv)
+        self.conv1 = torch.nn.Conv2d(in_channels,
+                                     out_channels,
+                                     kernel_size=3,
+                                     stride=1,
+                                     padding=1)
+        # The embedding projection only exists for a positive ``temb_channels``;
+        # forward must then be called with ``temb=None`` when it is absent.
+        if temb_channels > 0:
+            self.temb_proj = torch.nn.Linear(temb_channels,
+                                             out_channels)
+        self.norm2 = Normalize(out_channels, zq_ch, add_conv=add_conv)
+        self.dropout = torch.nn.Dropout(dropout)
+        self.conv2 = torch.nn.Conv2d(out_channels,
+                                     out_channels,
+                                     kernel_size=3,
+                                     stride=1,
+                                     padding=1)
+        # A learned shortcut is only needed when the channel count changes.
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                self.conv_shortcut = torch.nn.Conv2d(in_channels,
+                                                     out_channels,
+                                                     kernel_size=3,
+                                                     stride=1,
+                                                     padding=1)
+            else:
+                self.nin_shortcut = torch.nn.Conv2d(in_channels,
+                                                    out_channels,
+                                                    kernel_size=1,
+                                                    stride=1,
+                                                    padding=0)
+    def forward(self, x, temb, zq=None):
+        """Return ``shortcut(x) + residual(x)``; ``temb`` may be ``None`` to skip the embedding term."""
+        h = x
+        h = self.norm1(h, zq)
+        h = nonlinearity(h)
+        h = self.conv1(h)
+        if temb is not None:
+            # Project the embedding and add it to every spatial position.
+            h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]
+        h = self.norm2(h, zq)
+        h = nonlinearity(h)
+        h = self.dropout(h)
+        h = self.conv2(h)
+        # Match the shortcut's channel count to the residual branch when they differ.
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                x = self.conv_shortcut(x)
+            else:
+                x = self.nin_shortcut(x)
+        return x+h
+class AttnBlock(nn.Module):
+    """Single-head self-attention over the spatial positions of a feature map.
+    Queries, keys and values come from 1x1 convolutions of the normalized input.
+    The attended values go through a 1x1 output projection and are added back to the input.
+    """
+    def __init__(self, in_channels, zq_ch=None, add_conv=False):
+        super().__init__()
+        self.in_channels = in_channels
+        self.norm = Normalize(in_channels, zq_ch, add_conv=add_conv)
+        # All four projections are 1x1 convolutions that keep the channel count.
+        self.q = torch.nn.Conv2d(in_channels,
+                                 in_channels,
+                                 kernel_size=1,
+                                 stride=1,
+                                 padding=0)
+        self.k = torch.nn.Conv2d(in_channels,
+                                 in_channels,
+                                 kernel_size=1,
+                                 stride=1,
+                                 padding=0)
+        self.v = torch.nn.Conv2d(in_channels,
+                                 in_channels,
+                                 kernel_size=1,
+                                 stride=1,
+                                 padding=0)
+        self.proj_out = torch.nn.Conv2d(in_channels,
+                                        in_channels,
+                                        kernel_size=1,
+                                        stride=1,
+                                        padding=0)
+    def forward(self, x, zq=None):
+        """Return ``x`` plus the projected attention output; ``zq`` is forwarded to the norm."""
+        h_ = x
+        h_ = self.norm(h_, zq)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+        # compute attention
+        b,c,h,w = q.shape
+        q = q.reshape(b,c,h*w)
+        q = q.permute(0,2,1)   # b,hw,c
+        k = k.reshape(b,c,h*w) # b,c,hw
+        w_ = torch.bmm(q,k)     # b,hw,hw    w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
+        # Scale the logits by 1/sqrt(c) before the softmax over the key axis.
+        w_ = w_ * (int(c)**(-0.5))
+        w_ = torch.nn.functional.softmax(w_, dim=2)
+        # attend to values
+        v = v.reshape(b,c,h*w)
+        w_ = w_.permute(0,2,1)   # b,hw,hw (first hw of k, second of q)
+        h_ = torch.bmm(v,w_)     # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
+        h_ = h_.reshape(b,c,h,w)
+        h_ = self.proj_out(h_)
+        return x+h_
+class Encoder(nn.Module):
+    """Convolutional encoder: input conv, per-resolution ResnetBlock stacks, middle block, output conv.
+    Each resolution level holds ``num_res_blocks`` ResnetBlocks, with one AttnBlock
+    per ResnetBlock when the level's resolution is listed in ``attn_resolutions``.
+    Every level except the last ends with a ``Downsample``. ``out_ch`` and any
+    unknown keyword arguments are accepted but not used by this class.
+    """
+    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+                 resolution, z_channels, double_z=True, **ignore_kwargs):
+        super().__init__()
+        self.ch = ch
+        # No timestep embedding: with 0 channels ResnetBlock builds no ``temb_proj``.
+        self.temb_ch = 0
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        # downsampling
+        self.conv_in = torch.nn.Conv2d(in_channels,
+                                       self.ch,
+                                       kernel_size=3,
+                                       stride=1,
+                                       padding=1)
+        # Tracks the spatial resolution of the current level for the attention check.
+        curr_res = resolution
+        # Input multipliers are the output multipliers shifted by one level.
+        in_ch_mult = (1,)+tuple(ch_mult)
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = ch*in_ch_mult[i_level]
+            block_out = ch*ch_mult[i_level]
+            for i_block in range(self.num_res_blocks):
+                block.append(ResnetBlock(in_channels=block_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout))
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock(block_in))
+            # A bare nn.Module is used as a namespace so the submodules are registered.
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions-1:
+                down.downsample = Downsample(block_in, resamp_with_conv)
+                curr_res = curr_res // 2
+            self.down.append(down)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        # end
+        self.norm_out = Normalize(block_in)
+        # ``double_z`` doubles the number of output channels.
+        self.conv_out = torch.nn.Conv2d(block_in,
+                                        2*z_channels if double_z else z_channels,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+    def forward(self, x):
+        """Encode ``x`` and return the output of ``conv_out``."""
+        # No timestep embedding is used; the ResnetBlocks skip the embedding term.
+        temb = None
+        # downsampling
+        # ``hs`` collects every intermediate activation; only ``hs[-1]`` is read back.
+        hs = [self.conv_in(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1], temb)
+                # Attention blocks, when present, pair one-to-one with the ResnetBlocks.
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level != self.num_resolutions-1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h, temb)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h, temb)
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+class Decoder(nn.Module):
+    """Convolutional decoder whose norm layers are all conditioned on ``zq``.
+    Structure: input conv, middle block (ResnetBlock / AttnBlock / ResnetBlock), then
+    per-resolution stacks of ``num_res_blocks + 1`` ResnetBlocks walked from the lowest
+    resolution upwards, each level except the final one ending with an ``Upsample``.
+    Unknown keyword arguments are accepted and ignored.
+    """
+    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+                 resolution, z_channels, give_pre_end=False, zq_ch=None, add_conv=False, **ignorekwargs):
+        super().__init__()
+        self.ch = ch
+        # No timestep embedding: with 0 channels ResnetBlock builds no ``temb_proj``.
+        self.temb_ch = 0
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.give_pre_end = give_pre_end
+        # compute in_ch_mult, block_in and curr_res at lowest res
+        # NOTE: ``in_ch_mult`` is computed but not read anywhere in this class.
+        in_ch_mult = (1,)+tuple(ch_mult)
+        block_in = ch*ch_mult[self.num_resolutions-1]
+        curr_res = resolution // 2**(self.num_resolutions-1)
+        self.z_shape = (1,z_channels,curr_res,curr_res)
+        # z to block_in
+        self.conv_in = torch.nn.Conv2d(z_channels,
+                                       block_in,
+                                       kernel_size=3,
+                                       stride=1,
+                                       padding=1)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout,
+                                       zq_ch=zq_ch,
+                                       add_conv=add_conv)
+        self.mid.attn_1 = AttnBlock(block_in, zq_ch, add_conv=add_conv)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout,
+                                       zq_ch=zq_ch,
+                                       add_conv=add_conv)
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = ch*ch_mult[i_level]
+            for i_block in range(self.num_res_blocks+1):
+                block.append(ResnetBlock(in_channels=block_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout,
+                                         zq_ch=zq_ch,
+                                         add_conv=add_conv))
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock(block_in, zq_ch, add_conv=add_conv))
+            # A bare nn.Module is used as a namespace so the submodules are registered.
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:
+                up.upsample = Upsample(block_in, resamp_with_conv)
+                curr_res = curr_res * 2
+            self.up.insert(0, up) # prepend to get consistent order
+        # end
+        self.norm_out = Normalize(block_in, zq_ch, add_conv=add_conv)
+        self.conv_out = torch.nn.Conv2d(block_in,
+                                        out_ch,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+    def forward(self, z, zq):
+        """Decode ``z``; ``zq`` is passed to every norm-carrying block as conditioning."""
+        #assert z.shape[1:] == self.z_shape[1:]
+        # Side effect: remembers the shape of the most recent input.
+        self.last_z_shape = z.shape
+        # timestep embedding
+        temb = None
+        # z to block_in
+        h = self.conv_in(z)
+        # middle
+        h = self.mid.block_1(h, temb, zq)
+        h = self.mid.attn_1(h, zq)
+        h = self.mid.block_2(h, temb, zq)
+        # upsampling
+        # ``self.up`` is stored in ascending level order, so walk it in reverse.
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks+1):
+                h = self.up[i_level].block[i_block](h, temb, zq)
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h, zq)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+        # end
+        # With ``give_pre_end`` the features are returned before the final norm/conv.
+        if self.give_pre_end:
+            return h
+        h = self.norm_out(h, zq)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+class MoVQ(nn.Module):
+    """Encoder/decoder pair with 1x1 convolutions on either side of the latent.
+    ``generator_params`` is unpacked into both ``Encoder`` and ``Decoder`` and must
+    contain ``"z_channels"``. The decoder is built with ``zq_ch=z_channels``, so its
+    norm layers are conditioned on the latent passed to ``decode``.
+    """
+    def __init__(self, generator_params):
+        super().__init__()
+        z_channels = generator_params["z_channels"]
+        # NOTE(review): ``quant_conv`` expects ``z_channels`` inputs, whereas Encoder
+        # emits ``2*z_channels`` when ``double_z`` is true (its default) - presumably
+        # the configuration sets ``double_z`` to False; confirm against the config.
+        self.encoder = Encoder(**generator_params)
+        self.quant_conv = torch.nn.Conv2d(z_channels, z_channels, 1)
+        self.post_quant_conv = torch.nn.Conv2d(z_channels, z_channels, 1)
+        self.decoder = Decoder(zq_ch=z_channels, **generator_params)
+    # @torch.no_grad()
+    def encode(self, x):
+        """Return ``quant_conv(encoder(x))``."""
+        h = self.encoder(x)
+        h = self.quant_conv(h)
+        return h
+    # @torch.no_grad()
+    def decode(self, quant):
+        """Decode ``quant``; the unprojected ``quant`` also serves as the decoder's ``zq``."""
+        decoder_input = self.post_quant_conv(quant)
+        decoded = self.decoder(decoder_input, quant)
+        return decoded
+def get_vae(conf):
+    movq = MoVQ(conf.params)
+    if conf.checkpoint is not None:
+        movq_state_dict = torch.load(conf.checkpoint)
+        movq.load_state_dict(movq_state_dict)
+    movq = freeze(movq)
+    return movq

kandinsky3/setup.py ADDED Viewed

	@@ -0,0 +1,38 @@

+from setuptools import setup
+# Packaging metadata for the ``kandinsky3`` distribution.
+setup(
+    name="kandinsky3",
+    # NOTE(review): the sub-package is spelled with a slash; setuptools documents
+    # dotted names ("kandinsky3.model") - confirm this builds as intended.
+    packages=[
+        "kandinsky3",
+        "kandinsky3/model"
+    ],
+    # NOTE(review): torch/torchvision are pinned to "+cu111" local-version builds and
+    # setuptools itself is pinned here - verify these resolve in the target environment.
+    install_requires=[
+                    "timm",
+                    "torch==1.10.1+cu111",
+                    "torchvision==0.11.2+cu111",
+                    "torchaudio==0.10.1",
+                    "pytorch_lightning==1.7.5",
+                    "transformers",
+                    "accelerate",
+                    "diffusers",
+                    "setuptools==59.5.0",
+                    "omegaconf",
+                    "datasets",
+                    "einops",
+                    "webdataset",
+                    "fsspec",
+                    "s3fs",
+                    "hydra-core",
+                    "scikit-image",
+                    "matplotlib",
+                    "wandb",
+                    "albumentations",
+                    "bezier",
+                    "scipy",
+                    "Pillow",
+                    "tqdm",
+                    "huggingface_hub"
+    ],
+    author="",
+)

kandinsky3/t2i_pipeline.py ADDED Viewed

	@@ -0,0 +1,109 @@

+from typing import Union, List
+import PIL
+import torch
+import torchvision.transforms as T
+from einops import repeat
+from kandinsky3.model.unet import UNet
+from kandinsky3.movq import MoVQ
+from kandinsky3.condition_encoders import T5TextConditionEncoder
+from kandinsky3.condition_processors import T5TextConditionProcessor
+from kandinsky3.model.diffusion import BaseDiffusion, get_named_beta_schedule
+class Kandinsky3T2IPipeline:
+    """Text-to-image pipeline: T5 text encoding, diffusion sampling with the UNet, MoVQ decoding.
+    The pipeline only stores the components it is given. ``device_map`` and
+    ``dtype_map`` are indexed with the keys ``'text_encoder'``, ``'unet'`` and
+    ``'movq'`` inside ``__call__``, so mappings with those keys are needed in practice.
+    """
+    def __init__(
+            self,
+            device_map: Union[str, torch.device, dict],
+            dtype_map: Union[str, torch.dtype, dict],
+            unet: UNet,
+            null_embedding: torch.Tensor,
+            t5_processor: T5TextConditionProcessor,
+            t5_encoder: T5TextConditionEncoder,
+            movq: MoVQ,
+            gan: bool,
+    ):
+        self.device_map = device_map
+        self.dtype_map = dtype_map
+        # Converts a single image tensor into a PIL image.
+        self.to_pil = T.ToPILImage()
+        self.unet = unet
+        self.null_embedding = null_embedding
+        self.t5_processor = t5_processor
+        self.t5_encoder = t5_encoder
+        self.movq = movq
+        self.gan = gan
+    def __call__(
+            self,
+            text: str,
+            negative_text: str = None,
+            images_num: int = 1,
+            bs: int = 1,
+            width: int = 1024,
+            height: int = 1024,
+            guidance_scale: float = 3.0,
+            steps: int = 50,
+            eta: float = 1.0
+    ) -> List[PIL.Image.Image]:
+        """Generate ``images_num`` images for ``text`` and return them as PIL images.
+        Images are produced in mini-batches of at most ``bs``. Latents are sampled at
+        ``height // 8`` x ``width // 8`` with 4 channels and decoded by MoVQ.
+        ``negative_text`` is optional; when the processor returns no negative input,
+        no negative context is passed to the sampler.
+        """
+        betas = get_named_beta_schedule('cosine', 1000)
+        base_diffusion = BaseDiffusion(betas, 0.99)
+        # Descending timesteps starting at 999 with a stride of ``-1000 // steps``.
+        times = list(range(999, 0, -1000 // steps))
+        if self.gan:
+            # GAN mode ignores ``steps`` and uses the fixed schedule 979, 729, 479, 229.
+            times = list(range(979, 0, -250))
+        condition_model_input, negative_condition_model_input = self.t5_processor.encode(text, negative_text)
+        # ``[None]`` adds a leading batch axis before moving to the text-encoder device.
+        for input_type in condition_model_input:
+            condition_model_input[input_type] = condition_model_input[input_type][None].to(
+                self.device_map['text_encoder']
+            )
+        if negative_condition_model_input is not None:
+            for input_type in negative_condition_model_input:
+                negative_condition_model_input[input_type] = negative_condition_model_input[input_type][None].to(
+                    self.device_map['text_encoder']
+                )
+        pil_images = []
+        with torch.no_grad():
+            # The text is encoded once and reused for every mini-batch.
+            with torch.cuda.amp.autocast(dtype=self.dtype_map['text_encoder']):
+                context, context_mask = self.t5_encoder(condition_model_input)
+                if negative_condition_model_input is not None:
+                    negative_context, negative_context_mask = self.t5_encoder(negative_condition_model_input)
+                else:
+                    negative_context, negative_context_mask = None, None
+            # ``k`` full mini-batches of size ``bs`` plus one remainder batch of size ``m``.
+            k, m = images_num // bs, images_num % bs
+            for minibatch in [bs] * k + [m]:
+                # Skip the remainder batch when ``images_num`` is a multiple of ``bs``.
+                if minibatch == 0:
+                    continue
+                # Replicate the single encoded prompt across the mini-batch.
+                bs_context = repeat(context, '1 n d -> b n d', b=minibatch)
+                bs_context_mask = repeat(context_mask, '1 n -> b n', b=minibatch)
+                if negative_context is not None:
+                    bs_negative_context = repeat(negative_context, '1 n d -> b n d', b=minibatch)
+                    bs_negative_context_mask = repeat(negative_context_mask, '1 n -> b n', b=minibatch)
+                else:
+                    bs_negative_context, bs_negative_context_mask = None, None
+                with torch.cuda.amp.autocast(dtype=self.dtype_map['unet']):
+                    images = base_diffusion.p_sample_loop(
+                        self.unet, (minibatch, 4, height // 8, width // 8), times, self.device_map['unet'],
+                        bs_context, bs_context_mask, self.null_embedding, guidance_scale, eta,
+                        negative_context=bs_negative_context, negative_context_mask=bs_negative_context_mask,
+                        gan=self.gan
+                    )
+                with torch.cuda.amp.autocast(dtype=self.dtype_map['movq']):
+                    # Decode in up to two chunks along the batch axis, then re-join.
+                    images = torch.cat([self.movq.decode(image) for image in images.chunk(2)])
+                    # print(torch.max(images), torch.min(images))
+                    # Map from [-1, 1] to [0, 1] and clamp anything outside that range.
+                    images = torch.clip((images + 1.) / 2., 0., 1.)
+                    # print(torch.max(images), torch.min(images))
+                    # raise
+                    # ``chunk(1)`` yields the whole batch as a single chunk.
+                    for images_chunk in images.chunk(1):
+                        pil_images += [self.to_pil(image) for image in images_chunk]
+        return pil_images

kandinsky3/utils.py ADDED Viewed

	@@ -0,0 +1,71 @@

+from omegaconf import OmegaConf
+import numpy as np
+from scipy import ndimage
+import torch.nn as nn
+from skimage.transform import resize
+def load_conf(config_path):
+    conf = OmegaConf.load(config_path)
+    conf.data.tokens_length = conf.common.tokens_length
+    conf.data.processor_names = conf.model.encoders.model_names
+    conf.data.dataset.seed = conf.common.seed
+    conf.data.dataset.image_size = conf.common.image_size
+    conf.trainer.trainer_params.max_steps = conf.common.train_steps
+    conf.scheduler.params.total_steps = conf.common.train_steps
+    conf.logger.tensorboard.name = conf.common.experiment_name
+    conf.model.encoders.context_dim = conf.model.unet_params.context_dim
+    return conf
+def freeze(model):
+    for p in model.parameters():
+        p.requires_grad = False
+    return model
+def unfreeze(model):
+    for p in model.parameters():
+        p.requires_grad = True
+    return model
+def zero_module(module):
+    for p in module.parameters():
+        nn.init.zeros_(p)
+    return module
+def resize_mask_for_diffusion(mask):
+    reduce_factor = max(1, (mask.size / 1024**2)**0.5)
+    resized_mask = resize(
+        mask,
+        (
+            (round(mask.shape[0] / reduce_factor) // 64) * 64,
+            (round(mask.shape[1] / reduce_factor) // 64) * 64
+        ),
+        preserve_range=True,
+        anti_aliasing=False
+    )
+    return resized_mask
+def resize_image_for_diffusion(image):
+    reduce_factor = max(1, (image.size[0] * image.size[1] / 1024**2)**0.5)
+    image = image.resize((
+        (round(image.size[0] / reduce_factor) // 64) * 64, (round(image.size[1] / reduce_factor) // 64) * 64
+    ))
+    return image
+def prepare_mask(mask):
+    ker = np.array([[1, 1,  1, 1, 1],
+        [1, 5,  5, 5, 1],
+        [1, 5, 44, 5, 1],
+        [1, 5,  5, 5, 1],
+        [1, 1,  1, 1, 1]]) / 100
+    out = ndimage.convolve(mask, ker)
+    out = ndimage.convolve(out, ker)
+    out = ndimage.convolve(out, ker)
+    mask = (out > 0).astype(int)
+    return mask

unet_model_checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4165a1e48f2a7630729c03c8c7662b6acf8b9f6a590102ba80051c01afb480eb
+size 12154895798