Upload folder using huggingface_hub
- LICENSE +201 -0
- app.py +821 -0
- assets/mask2face/black_seg.png +0 -0
- assets/mask2face/handou_seg.png +0 -0
- assets/multimodal/liuyifei_seg.png +0 -0
- assets/multimodal/musk_seg.png +0 -0
- config/Face-MoGLE.yaml +40 -0
- requirements.txt +17 -0
- src/flux/__pycache__/block.cpython-311.pyc +0 -0
- src/flux/__pycache__/condition.cpython-311.pyc +0 -0
- src/flux/__pycache__/generate.cpython-311.pyc +0 -0
- src/flux/__pycache__/lora_controller.cpython-311.pyc +0 -0
- src/flux/__pycache__/pipeline_tools.cpython-311.pyc +0 -0
- src/flux/__pycache__/transformer.cpython-311.pyc +0 -0
- src/flux/block.py +345 -0
- src/flux/condition.py +129 -0
- src/flux/generate.py +316 -0
- src/flux/lora_controller.py +75 -0
- src/flux/pipeline_tools.py +52 -0
- src/flux/transformer.py +257 -0
- src/moe/__pycache__/mogle.cpython-311.pyc +0 -0
- src/moe/mogle.py +140 -0
- src/train/callbacks.py +170 -0
- src/train/data.py +98 -0
- src/train/model.py +201 -0
- weights/mogle.pt +3 -0
- weights/pytorch_lora_weights.safetensors +3 -0
LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [2024] [Zhenxiong Tan]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
app.py
ADDED
@@ -0,0 +1,821 @@
import gradio as gr
from PIL import Image
import torch
import yaml
import numpy as np
from torchvision.models import convnext_base, convnext_small
from torch import nn as nn
import facer
from torch import Tensor
import math
from typing import Any, Optional, Tuple, Type
from torch.nn import functional as F
import torchvision
from torchvision import transforms as T
from src.flux.generate import generate
from diffusers.pipelines import FluxPipeline
from src.flux.condition import Condition
from src.moe.mogle import MoGLE


class LayerNorm2d(nn.Module):
    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


class MLP(nn.Module):
    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
        num_layers: int,
        sigmoid_output: bool = False,
    ) -> None:
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(
            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])
        )
        self.sigmoid_output = sigmoid_output

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        if self.sigmoid_output:
            x = F.sigmoid(x)
        return x


class FaceDecoder(nn.Module):
    def __init__(
        self,
        *,
        transformer_dim: int = 256,
        transformer: nn.Module,
        activation: Type[nn.Module] = nn.GELU,
    ) -> None:
        super().__init__()
        self.transformer_dim = transformer_dim
        self.transformer = transformer

        self.background_token = nn.Embedding(1, transformer_dim)
        self.neck_token = nn.Embedding(1, transformer_dim)
        self.face_token = nn.Embedding(1, transformer_dim)
        self.cloth_token = nn.Embedding(1, transformer_dim)
        self.rightear_token = nn.Embedding(1, transformer_dim)
        self.leftear_token = nn.Embedding(1, transformer_dim)
        self.rightbro_token = nn.Embedding(1, transformer_dim)
        self.leftbro_token = nn.Embedding(1, transformer_dim)
        self.righteye_token = nn.Embedding(1, transformer_dim)
        self.lefteye_token = nn.Embedding(1, transformer_dim)
        self.nose_token = nn.Embedding(1, transformer_dim)
        self.innermouth_token = nn.Embedding(1, transformer_dim)
        self.lowerlip_token = nn.Embedding(1, transformer_dim)
        self.upperlip_token = nn.Embedding(1, transformer_dim)
        self.hair_token = nn.Embedding(1, transformer_dim)
        self.glass_token = nn.Embedding(1, transformer_dim)
        self.hat_token = nn.Embedding(1, transformer_dim)
        self.earring_token = nn.Embedding(1, transformer_dim)
        self.necklace_token = nn.Embedding(1, transformer_dim)

        self.output_upscaling = nn.Sequential(
            nn.ConvTranspose2d(
                transformer_dim, transformer_dim // 4, kernel_size=2, stride=2
            ),
            LayerNorm2d(transformer_dim // 4),
            activation(),
            nn.ConvTranspose2d(
                transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2
            ),
            activation(),
        )

        self.output_hypernetwork_mlps = MLP(
            transformer_dim, transformer_dim, transformer_dim // 8, 3
        )

    def forward(
        self,
        image_embeddings: torch.Tensor,
        image_pe: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        image_embeddings - torch.Size([1, 256, 128, 128])
        image_pe - torch.Size([1, 256, 128, 128])
        """
        output_tokens = torch.cat(
            [
                self.background_token.weight,
                self.neck_token.weight,
                self.face_token.weight,
                self.cloth_token.weight,
                self.rightear_token.weight,
                self.leftear_token.weight,
                self.rightbro_token.weight,
                self.leftbro_token.weight,
                self.righteye_token.weight,
                self.lefteye_token.weight,
                self.nose_token.weight,
                self.innermouth_token.weight,
                self.lowerlip_token.weight,
                self.upperlip_token.weight,
                self.hair_token.weight,
                self.glass_token.weight,
                self.hat_token.weight,
                self.earring_token.weight,
                self.necklace_token.weight,
            ],
            dim=0,
        )

        tokens = output_tokens.unsqueeze(0).expand(
            image_embeddings.size(0), -1, -1
        )  ##### torch.Size([4, 11, 256])

        src = image_embeddings  ##### torch.Size([4, 256, 128, 128])
        pos_src = image_pe.expand(image_embeddings.size(0), -1, -1, -1)
        b, c, h, w = src.shape

        # Run the transformer
        hs, src = self.transformer(
            src, pos_src, tokens
        )  ####### hs - torch.Size([BS, 11, 256]), src - torch.Size([BS, 16348, 256])
        mask_token_out = hs[:, :, :]

        src = src.transpose(1, 2).view(b, c, h, w)  ##### torch.Size([4, 256, 128, 128])
        upscaled_embedding = self.output_upscaling(
            src
        )  ##### torch.Size([4, 32, 512, 512])
        hyper_in = self.output_hypernetwork_mlps(
            mask_token_out
        )  ##### torch.Size([1, 11, 32])
        b, c, h, w = upscaled_embedding.shape
        seg_output = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(
            b, -1, h, w
        )  ##### torch.Size([1, 11, 512, 512])

        return seg_output


class PositionEmbeddingRandom(nn.Module):
    """
    Positional encoding using random spatial frequencies.
    """

    def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
        super().__init__()
        if scale is None or scale <= 0.0:
            scale = 1.0
        self.register_buffer(
            "positional_encoding_gaussian_matrix",
            scale * torch.randn((2, num_pos_feats)),
        )

    def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
        """Positionally encode points that are normalized to [0,1]."""
        # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
        coords = 2 * coords - 1
        coords = coords @ self.positional_encoding_gaussian_matrix
        coords = 2 * np.pi * coords
        # outputs d_1 x ... x d_n x C shape
        return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)

    def forward(self, size: Tuple[int, int]) -> torch.Tensor:
        """Generate positional encoding for a grid of the specified size."""
        h, w = size
        device: Any = self.positional_encoding_gaussian_matrix.device
        grid = torch.ones((h, w), device=device, dtype=torch.float32)
        y_embed = grid.cumsum(dim=0) - 0.5
        x_embed = grid.cumsum(dim=1) - 0.5
        y_embed = y_embed / h
        x_embed = x_embed / w

        pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
        return pe.permute(2, 0, 1)  # C x H x W

    def forward_with_coords(
        self, coords_input: torch.Tensor, image_size: Tuple[int, int]
    ) -> torch.Tensor:
        """Positionally encode points that are not normalized to [0,1]."""
        coords = coords_input.clone()
        coords[:, :, 0] = coords[:, :, 0] / image_size[1]
        coords[:, :, 1] = coords[:, :, 1] / image_size[0]
        return self._pe_encoding(coords.to(torch.float))  # B x N x C


class TwoWayTransformer(nn.Module):
    def __init__(
        self,
        depth: int,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
    ) -> None:
        """
        A transformer decoder that attends to an input image using
        queries whose positional embedding is supplied.

        Args:
          depth (int): number of layers in the transformer
          embedding_dim (int): the channel dimension for the input embeddings
          num_heads (int): the number of heads for multihead attention. Must
            divide embedding_dim
          mlp_dim (int): the channel dimension internal to the MLP block
          activation (nn.Module): the activation to use in the MLP block
        """
        super().__init__()
        self.depth = depth
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.mlp_dim = mlp_dim
        self.layers = nn.ModuleList()

        for i in range(depth):
            self.layers.append(
                TwoWayAttentionBlock(
                    embedding_dim=embedding_dim,
                    num_heads=num_heads,
                    mlp_dim=mlp_dim,
                    activation=activation,
                    attention_downsample_rate=attention_downsample_rate,
                    skip_first_layer_pe=(i == 0),
                )
            )

        self.final_attn_token_to_image = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )
        self.norm_final_attn = nn.LayerNorm(embedding_dim)

    def forward(
        self,
        image_embedding: Tensor,
        image_pe: Tensor,
        point_embedding: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """
        Args:
          image_embedding (torch.Tensor): image to attend to. Should be shape
            B x embedding_dim x h x w for any h and w.
          image_pe (torch.Tensor): the positional encoding to add to the image. Must
            have the same shape as image_embedding.
          point_embedding (torch.Tensor): the embedding to add to the query points.
            Must have shape B x N_points x embedding_dim for any N_points.

        Returns:
          torch.Tensor: the processed point_embedding
          torch.Tensor: the processed image_embedding
        """
        # BxCxHxW -> BxHWxC == B x N_image_tokens x C
        bs, c, h, w = image_embedding.shape
        image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
        image_pe = image_pe.flatten(2).permute(0, 2, 1)

        # Prepare queries
        queries = point_embedding
        keys = image_embedding

        # Apply transformer blocks and final layernorm
        for layer in self.layers:
            queries, keys = layer(
                queries=queries,
                keys=keys,
                query_pe=point_embedding,
                key_pe=image_pe,
            )

        # Apply the final attention layer from the points to the image
        q = queries + point_embedding
        k = keys + image_pe
        attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
        queries = queries + attn_out
        queries = self.norm_final_attn(queries)

        return queries, keys


class MLPBlock(nn.Module):
    def __init__(
        self,
        embedding_dim: int,
        mlp_dim: int,
        act: Type[nn.Module] = nn.GELU,
    ) -> None:
        super().__init__()
        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
        self.act = act()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.lin2(self.act(self.lin1(x)))


class TwoWayAttentionBlock(nn.Module):
    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int = 2048,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
        skip_first_layer_pe: bool = False,
    ) -> None:
        """
        A transformer block with four layers: (1) self-attention of sparse
        inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp
        block on sparse inputs, and (4) cross attention of dense inputs to sparse
        inputs.

        Arguments:
          embedding_dim (int): the channel dimension of the embeddings
          num_heads (int): the number of heads in the attention layers
          mlp_dim (int): the hidden dimension of the mlp block
          activation (nn.Module): the activation of the mlp block
          skip_first_layer_pe (bool): skip the PE on the first layer
        """
        super().__init__()
        self.self_attn = Attention(embedding_dim, num_heads)
        self.norm1 = nn.LayerNorm(embedding_dim)

        self.cross_attn_token_to_image = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )
        self.norm2 = nn.LayerNorm(embedding_dim)

        self.mlp = MLPBlock(embedding_dim, mlp_dim, activation)
        self.norm3 = nn.LayerNorm(embedding_dim)

        self.norm4 = nn.LayerNorm(embedding_dim)
        self.cross_attn_image_to_token = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )

        self.skip_first_layer_pe = skip_first_layer_pe

    def forward(
        self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor
    ) -> Tuple[Tensor, Tensor]:
        # Self attention block
        if self.skip_first_layer_pe:
            queries = self.self_attn(q=queries, k=queries, v=queries)
        else:
            q = queries + query_pe
            attn_out = self.self_attn(q=q, k=q, v=queries)
            queries = queries + attn_out
        queries = self.norm1(queries)

        # Cross attention block, tokens attending to image embedding
        q = queries + query_pe
        k = keys + key_pe
        attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
        queries = queries + attn_out
        queries = self.norm2(queries)

        # MLP block
        mlp_out = self.mlp(queries)
        queries = queries + mlp_out
        queries = self.norm3(queries)

        # Cross attention block, image embedding attending to tokens
        q = queries + query_pe
        k = keys + key_pe
        attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
        keys = keys + attn_out
        keys = self.norm4(keys)

        return queries, keys


class Attention(nn.Module):
    """
    An attention layer that allows for downscaling the size of the embedding
    after projection to queries, keys, and values.
    """

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        downsample_rate: int = 1,
    ) -> None:
        super().__init__()
        self.embedding_dim = embedding_dim
        self.internal_dim = embedding_dim // downsample_rate
        self.num_heads = num_heads
        assert (
            self.internal_dim % num_heads == 0
        ), "num_heads must divide embedding_dim."

        self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.k_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)

    def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
        b, n, c = x.shape
        x = x.reshape(b, n, num_heads, c // num_heads)
        return x.transpose(1, 2)  # B x N_heads x N_tokens x C_per_head

    def _recombine_heads(self, x: Tensor) -> Tensor:
        b, n_heads, n_tokens, c_per_head = x.shape
        x = x.transpose(1, 2)
        return x.reshape(b, n_tokens, n_heads * c_per_head)  # B x N_tokens x C

    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
        # Input projections
        q = self.q_proj(q)
        k = self.k_proj(k)
        v = self.v_proj(v)

        # Separate into heads
        q = self._separate_heads(q, self.num_heads)
        k = self._separate_heads(k, self.num_heads)
        v = self._separate_heads(v, self.num_heads)

        # Attention
        _, _, _, c_per_head = q.shape
        attn = q @ k.permute(0, 1, 3, 2)  # B x N_heads x N_tokens x N_tokens
        attn = attn / math.sqrt(c_per_head)
        attn = torch.softmax(attn, dim=-1)

        # Get output
        out = attn @ v
        out = self._recombine_heads(out)
        out = self.out_proj(out)

        return out


class SegfaceMLP(nn.Module):
    """
    Linear Embedding.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.proj = nn.Linear(input_dim, 256)

    def forward(self, hidden_states: torch.Tensor):
        hidden_states = hidden_states.flatten(2).transpose(1, 2)
        hidden_states = self.proj(hidden_states)
        return hidden_states


class SegFaceCeleb(nn.Module):
    def __init__(self, input_resolution, model):
        super(SegFaceCeleb, self).__init__()
        self.input_resolution = input_resolution
        self.model = model

        if self.model == "convnext_base":
            convnext = convnext_base(pretrained=False)
            self.backbone = torch.nn.Sequential(*(list(convnext.children())[:-1]))
            self.target_layer_names = ["0.1", "0.3", "0.5", "0.7"]
            self.multi_scale_features = []

        if self.model == "convnext_small":
            convnext = convnext_small(pretrained=False)
            self.backbone = torch.nn.Sequential(*(list(convnext.children())[:-1]))
            self.target_layer_names = ["0.1", "0.3", "0.5", "0.7"]
            self.multi_scale_features = []

        if self.model == "convnext_tiny":
            convnext = convnext_small(pretrained=False)
            self.backbone = torch.nn.Sequential(*(list(convnext.children())[:-1]))
            self.target_layer_names = ["0.1", "0.3", "0.5", "0.7"]
            self.multi_scale_features = []

        embed_dim = 1024
        out_chans = 256

        self.pe_layer = PositionEmbeddingRandom(out_chans // 2)

        for name, module in self.backbone.named_modules():
            if name in self.target_layer_names:
                module.register_forward_hook(self.save_features_hook(name))

        self.face_decoder = FaceDecoder(
            transformer_dim=256,
            transformer=TwoWayTransformer(
                depth=2,
                embedding_dim=256,
                mlp_dim=2048,
                num_heads=8,
            ),
        )

        num_encoder_blocks = 4
        if self.model in ["swin_base", "swinv2_base", "convnext_base"]:
            hidden_sizes = [128, 256, 512, 1024]  ### Swin Base and ConvNext Base
        if self.model in ["resnet"]:
            hidden_sizes = [256, 512, 1024, 2048]  ### ResNet
        if self.model in [
            "swinv2_small",
            "swinv2_tiny",
            "convnext_small",
            "convnext_tiny",
        ]:
            hidden_sizes = [
                96,
                192,
                384,
                768,
            ]  ### Swin Small/Tiny and ConvNext Small/Tiny
        if self.model in ["mobilenet"]:
            hidden_sizes = [24, 40, 112, 960]  ### MobileNet
        if self.model in ["efficientnet"]:
            hidden_sizes = [48, 80, 176, 1280]  ### EfficientNet
        decoder_hidden_size = 256

        mlps = []
        for i in range(num_encoder_blocks):
            mlp = SegfaceMLP(input_dim=hidden_sizes[i])
            mlps.append(mlp)
        self.linear_c = nn.ModuleList(mlps)

        # The following 3 layers implement the ConvModule of the original implementation
        self.linear_fuse = nn.Conv2d(
            in_channels=decoder_hidden_size * num_encoder_blocks,
            out_channels=decoder_hidden_size,
            kernel_size=1,
            bias=False,
        )

    def save_features_hook(self, name):
        def hook(module, input, output):
            if self.model in [
                "swin_base",
                "swinv2_base",
                "swinv2_small",
                "swinv2_tiny",
            ]:
                self.multi_scale_features.append(
                    output.permute(0, 3, 1, 2).contiguous()
                )  ### Swin, Swinv2
            if self.model in [
                "convnext_base",
                "convnext_small",
                "convnext_tiny",
                "mobilenet",
                "efficientnet",
            ]:
                self.multi_scale_features.append(
                    output
                )  ### ConvNext, ResNet, EfficientNet, MobileNet

        return hook

    def forward(self, x):
        self.multi_scale_features.clear()

        _, _, h, w = x.shape
        features = self.backbone(x).squeeze()

        batch_size = self.multi_scale_features[-1].shape[0]
        all_hidden_states = ()
        for encoder_hidden_state, mlp in zip(self.multi_scale_features, self.linear_c):
            height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3]
            encoder_hidden_state = mlp(encoder_hidden_state)
            encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1)
            encoder_hidden_state = encoder_hidden_state.reshape(
                batch_size, -1, height, width
            )
            # upsample
            encoder_hidden_state = nn.functional.interpolate(
                encoder_hidden_state,
                size=self.multi_scale_features[0].size()[2:],
                mode="bilinear",
                align_corners=False,
            )
            all_hidden_states += (encoder_hidden_state,)

        fused_states = self.linear_fuse(
            torch.cat(all_hidden_states[::-1], dim=1)
        )  #### torch.Size([BS, 256, 128, 128])
        image_pe = self.pe_layer(
            (fused_states.shape[2], fused_states.shape[3])
        ).unsqueeze(0)
        seg_output = self.face_decoder(image_embeddings=fused_states, image_pe=image_pe)

        return seg_output


# Wrapper class that initializes the models and configuration
class ImageGenerator:
    def __init__(self):
        self.args = self.get_args()
        self.pipeline, self.moe_model = self.get_model(self.args)
        with open(self.args.config_path, "r") as f:
            self.model_config = yaml.safe_load(f)["model"]
        self.farl = facer.face_parser(
            "farl/celebm/448",
            self.args.device,
            model_path="https://github.com/FacePerceiver/facer/releases/download/models-v1/face_parsing.farl.celebm.main_ema_181500_jit.pt",
        )
        self.segface = SegFaceCeleb(512, "convnext_base").to(self.args.device)
        checkpoint = torch.hub.load_state_dict_from_url("https://huggingface.co/kartiknarayan/SegFace/resolve/main/convnext_celeba_512/model_299.pt")
        self.segface.load_state_dict(checkpoint["state_dict_backbone"])
        self.segface.eval()
        self.segface_transforms = torchvision.transforms.Compose(
            [
                torchvision.transforms.ToTensor(),
                torchvision.transforms.Normalize(
                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                ),
            ]
        )

        self.seg_face_remap_dict = {
            0: 0, 1: 17, 2: 1, 3: 18, 4: 9, 5: 8, 6: 7, 7: 6,
            8: 5, 9: 4, 10: 2, 11: 10, 12: 12, 13: 11, 14: 13,
            15: 3, 16: 14, 17: 15, 18: 16,
        }

        self.palette = np.array(
            [
                (0, 0, 0), (204, 0, 0), (76, 153, 0), (204, 204, 0),
                (204, 0, 204), (51, 51, 255), (255, 204, 204), (0, 255, 255),
                (255, 0, 0), (102, 51, 0), (102, 204, 0), (255, 255, 0),
                (0, 0, 153), (0, 0, 204), (255, 51, 153), (0, 204, 204),
                (0, 51, 0), (255, 153, 51), (0, 204, 0),
            ],
            dtype=np.uint8,
        )

        self.org_labels = [
            "background", "face", "nose", "eyeg", "le", "re", "lb", "rb",
            "lr", "rr", "imouth", "ulip", "llip", "hair", "hat", "earr",
            "neck_l", "neck", "cloth",
        ]

        self.new_labels = [
            "background", "neck", "face", "cloth", "rr", "lr", "rb", "lb",
            "re", "le", "nose", "imouth", "llip", "ulip", "hair", "eyeg",
            "hat", "earr", "neck_l",
        ]

    @torch.no_grad()
    def parse_face_with_farl(self, image):
        image = image.resize((512, 512), Image.BICUBIC)
        image_np = np.array(image)
        image_pt = torch.tensor(image_np).to(self.args.device)
        image_pt = image_pt.permute(2, 0, 1).unsqueeze(0).float()
        pred, _ = self.farl.net(image_pt / 255.0)
        vis_seg_probs = pred.argmax(dim=1).detach().cpu().numpy()[0].astype(np.uint8)
        remapped_mask = np.zeros_like(vis_seg_probs, dtype=np.uint8)
        for i, pred_label in enumerate(self.new_labels):
            if pred_label in self.org_labels:
                remapped_mask[vis_seg_probs == i] = self.org_labels.index(pred_label)
        vis_seg_probs = Image.fromarray(remapped_mask).convert("P")
        vis_seg_probs.putpalette(self.palette.flatten())
        return vis_seg_probs

    @torch.no_grad()
    def parse_face_with_segface(self, image):
        image = image.resize((512, 512), Image.BICUBIC)
        image = self.segface_transforms(image)
        logits = self.segface(image.unsqueeze(0).to(self.args.device))
        vis_seg_probs = logits.argmax(dim=1).detach().cpu().numpy()[0].astype(np.uint8)
        new_mask = np.zeros_like(vis_seg_probs)
        for old_idx, new_idx in self.seg_face_remap_dict.items():
            new_mask[vis_seg_probs == old_idx] = new_idx
        vis_seg_probs = Image.fromarray(new_mask).convert("P")
        vis_seg_probs.putpalette(self.palette.flatten())
        return vis_seg_probs

    def get_args(self):
        class Args:
            pipe = "black-forest-labs/FLUX.1-dev"
            lora_ckpt = "weights"
            moe_ckpt = "weights/mogle.pt"
            pretrained_ckpt = "weights/FLUX.1-dev"
            device = "cuda" if torch.cuda.is_available() else "cpu"
            size = 512
            seed = 42
            config_path = "config/Face-MoGLE.yaml"
        return Args()

    def get_model(self, args):
        pipeline = FluxPipeline.from_pretrained(
            args.pretrained_ckpt, torch_dtype=torch.bfloat16
        )
        pipeline.load_lora_weights(
            args.lora_ckpt, weight_name="pytorch_lora_weights.safetensors"
        )
        pipeline.to(args.device)
        moe_model = MoGLE()
        moe_weight = torch.load(args.moe_ckpt, map_location="cpu")
        moe_model.load_state_dict(moe_weight, strict=True)
        moe_model = moe_model.to(device=args.device, dtype=torch.bfloat16)
        moe_model.eval()
        return pipeline, moe_model

    def pack_data(self, mask_image: Image.Image):
        mask = np.array(mask_image.convert("L"))
        mask_list = [T.ToTensor()(mask_image.convert("RGB"))]
        for i in range(19):
            local_mask = np.zeros_like(mask)
            local_mask[mask == i] = 255
            local_mask_tensor = T.ToTensor()(Image.fromarray(local_mask).convert("RGB"))
            mask_list.append(local_mask_tensor)
        condition_img = torch.stack(mask_list, dim=0)
        return Condition(condition_type="depth", condition=condition_img, position_delta=[0, 0])

    def generate(self, prompt: str, mask_image: Image.Image, seed: int, num_inference_steps=28):
        generator = torch.Generator().manual_seed(seed)
        condition = self.pack_data(mask_image)
        result = generate(
            self.pipeline,
            mogle=self.moe_model,
            prompt=prompt,
            conditions=[condition],
            height=self.args.size,
            width=self.args.size,
            generator=generator,
            model_config=self.model_config,
            default_lora=True,
            num_inference_steps=num_inference_steps,
        )
        return result.images[0]


# Instantiate the generator
generator = ImageGenerator()

examples = [
    ["", "assets/mask2face/handou_seg.png", None, "FaRL", 42, 28],
    ["", "assets/mask2face/black_seg.png", None, "FaRL", 42, 28],
    ["She has red hair", "assets/multimodal/liuyifei_seg.png", None, "FaRL", 42, 28],
    ["He is old", "assets/multimodal/musk_seg.png", None, "FaRL", 42, 28],
    ["Curly-haired woman with glasses", None, None, "FaRL", 42, 28],
    ["Man with beard and tie", None, None, "FaRL", 42, 28],
]

# Gradio interface (using Blocks)
with gr.Blocks(title="Controllable Face Generation with MoGLE") as demo:
    gr.Markdown("## 🎭 Controllable Face Generation via Prompt + Face Parsing")

    with gr.Row():
        prompt = gr.Textbox(label="Text Prompt", placeholder="Describe the face you'd like to generate...")

    with gr.Row():
        with gr.Column():
            mask_image = gr.Image(type="pil", label="🧩 Semantic Mask (Optional)")
            rgb_image = gr.Image(type="pil", label="🖼️ Facial Image (Optional)")
            model_choice = gr.Radio(["FaRL", "SegFace"], label="Face Parsing Model", value="FaRL")
            seed = gr.Slider(minimum=0, maximum=100000, step=1, value=42, label="Random Seed")
            num_inference_steps = gr.Slider(minimum=1, maximum=100, step=1, value=28, label="Sampling Steps")
            submit_btn = gr.Button("Generate")

        with gr.Column():
            gr.Markdown("### 🧠 Parsed Mask Preview")
            preview_mask = gr.Image(label="Parsed Mask (from RGB)", interactive=False)
            output_image = gr.Image(label="🎨 Generated Image")

    def generate_wrapper(prompt, mask_image, rgb_image, model_choice, seed, num_inference_steps):
        if mask_image is None and rgb_image is not None:
            if model_choice == "FaRL":
                mask_image = generator.parse_face_with_farl(rgb_image)
            else:
                mask_image = generator.parse_face_with_segface(rgb_image)
        elif mask_image is None and rgb_image is None:
            # raise gr.Error("Please upload at least one input: a semantic mask or an RGB face image.")
            mask_image = Image.new("RGB", size=(512, 512))
        return mask_image, generator.generate(prompt, mask_image, seed, num_inference_steps)

    submit_btn.click(
        fn=generate_wrapper,
        inputs=[prompt, mask_image, rgb_image, model_choice, seed, num_inference_steps],
        outputs=[preview_mask, output_image]
    )
    gr.Examples(
        examples=examples,
        inputs=[prompt, mask_image, rgb_image, model_choice, seed, num_inference_steps],
        outputs=[preview_mask, output_image],
        fn=lambda *args: generate_wrapper(*args),  # directly reuse the function defined above
        cache_examples=False,
        label="Click any example below to try:"
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=5000, share=False)
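A note on the conditioning format: `ImageGenerator.pack_data` above stacks the full mask together with 19 per-class binary masks into a 20-entry condition tensor before handing it to `generate`. The minimal sketch below reproduces just that packing step on a random label map so the resulting layout is easy to inspect; the dummy labels and sizes are illustrative only.

```python
import numpy as np
import torch
from PIL import Image
from torchvision import transforms as T

# dummy 512x512 label map with class ids 0..18 (same 19-class convention as app.py)
labels = np.random.randint(0, 19, size=(512, 512), dtype=np.uint8)
mask_image = Image.fromarray(labels)

mask = np.array(mask_image.convert("L"))
mask_list = [T.ToTensor()(mask_image.convert("RGB"))]  # global mask, 3 x 512 x 512
for i in range(19):                                     # one binary map per class
    local_mask = np.zeros_like(mask)
    local_mask[mask == i] = 255
    mask_list.append(T.ToTensor()(Image.fromarray(local_mask).convert("RGB")))

condition_img = torch.stack(mask_list, dim=0)
print(condition_img.shape)  # torch.Size([20, 3, 512, 512])
```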
assets/mask2face/black_seg.png
ADDED
assets/mask2face/handou_seg.png
ADDED
assets/multimodal/liuyifei_seg.png
ADDED
assets/multimodal/musk_seg.png
ADDED
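The example rows in `app.py` point at these masks. For reference, a minimal sketch of driving the same pipeline without the Gradio UI, assuming the repository layout above (FLUX.1-dev under `weights/FLUX.1-dev`, the LoRA and MoGLE weights under `weights/`) and a CUDA device with enough memory:

```python
from PIL import Image
import app  # importing app.py builds the pipeline and MoGLE head once at module level

mask = Image.open("assets/multimodal/liuyifei_seg.png")
image = app.generator.generate(
    prompt="She has red hair",
    mask_image=mask,
    seed=42,
    num_inference_steps=28,
)
image.save("result.png")
```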
config/Face-MoGLE.yaml
ADDED
@@ -0,0 +1,40 @@
# flux_path: "black-forest-labs/FLUX.1-dev"
sd_path: "checkpoints/FLUX.1-dev"
dtype: "bfloat16"

model:
  union_cond_attn: true
  add_cond_attn: false
  latent_lora: false

train:
  batch_size: 4
  accumulate_grad_batches: 1
  dataloader_workers: 4
  save_interval: 1000
  sample_interval: 100
  max_steps: -1
  gradient_checkpointing: true
  save_path: "runs/face-mogle"

  condition_type: "depth"
  dataset:
    root: "data/mmcelebahq"
    condition_size: 512
    target_size: 512
    drop_text_prob: 0.1
    drop_image_prob: 0.1

  lora_config:
    r: 4
    lora_alpha: 4
    init_lora_weights: "gaussian"
    target_modules: "(.*x_embedder|.*(?<!single_)transformer_blocks\\.[0-9]+\\.norm1\\.linear|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_k|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_q|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_v|.*(?<!single_)transformer_blocks\\.[0-9]+\\.attn\\.to_out\\.0|.*(?<!single_)transformer_blocks\\.[0-9]+\\.ff\\.net\\.2|.*single_transformer_blocks\\.[0-9]+\\.norm\\.linear|.*single_transformer_blocks\\.[0-9]+\\.proj_mlp|.*single_transformer_blocks\\.[0-9]+\\.proj_out|.*single_transformer_blocks\\.[0-9]+\\.attn.to_k|.*single_transformer_blocks\\.[0-9]+\\.attn.to_q|.*single_transformer_blocks\\.[0-9]+\\.attn.to_v|.*single_transformer_blocks\\.[0-9]+\\.attn.to_out)"

  optimizer:
    type: "Prodigy"
    params:
      lr: 1
      use_bias_correction: true
      safeguard_warmup: true
      weight_decay: 0.01
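`target_modules` above is a single regular expression; when PEFT receives a string here it matches it against each module name with a full-match regex, so an easy way to sanity-check which FLUX sub-modules the pattern selects is to test candidate names directly. A small sketch using an abridged version of the pattern; the module names below are illustrative, not read from the real model:

```python
import re

# two of the alternatives from the config regex, abridged for readability
pattern = (
    r"(.*x_embedder"
    r"|.*(?<!single_)transformer_blocks\.[0-9]+\.attn\.to_q"
    r"|.*single_transformer_blocks\.[0-9]+\.attn.to_q)"
)

for name in [
    "transformer.x_embedder",
    "transformer.transformer_blocks.3.attn.to_q",
    "transformer.single_transformer_blocks.7.attn.to_q",
    "transformer.transformer_blocks.3.ff.net.0.proj",
]:
    print(name, "->", bool(re.fullmatch(pattern, name)))
# True, True, True, False: double and single blocks are matched by separate
# alternatives, and untargeted layers fall through.
```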
requirements.txt
ADDED
@@ -0,0 +1,17 @@
diffusers==0.31.0
transformers
peft
opencv-python
protobuf
sentencepiece
gradio
jupyter
torchao
pyfacer
pyyaml

lightning
datasets
torchvision
prodigyopt
wandb
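Note that several of these packages install under a different name than they import (`pyfacer` → `facer`, `pyyaml` → `yaml`, `opencv-python` → `cv2`). A quick sanity check, using import names, that the runtime dependencies resolve before the Space starts:

```python
import importlib.util

# import names (not pip package names) for a few of the requirements above
modules = ["torch", "torchvision", "diffusers", "transformers", "peft",
           "gradio", "yaml", "facer", "cv2", "sentencepiece"]

missing = [m for m in modules if importlib.util.find_spec(m) is None]
print("missing modules:", missing or "none")
```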
src/flux/__pycache__/block.cpython-311.pyc
ADDED
Binary file (14.6 kB)
src/flux/__pycache__/condition.cpython-311.pyc
ADDED
Binary file (5.74 kB)
src/flux/__pycache__/generate.cpython-311.pyc
ADDED
Binary file (11.9 kB)
src/flux/__pycache__/lora_controller.cpython-311.pyc
ADDED
Binary file (5.12 kB)
src/flux/__pycache__/pipeline_tools.cpython-311.pyc
ADDED
Binary file (2.56 kB)
src/flux/__pycache__/transformer.cpython-311.pyc
ADDED
Binary file (7.85 kB)
src/flux/block.py
ADDED
@@ -0,0 +1,345 @@
import torch
from typing import List, Union, Optional, Dict, Any, Callable
from diffusers.models.attention_processor import Attention, F
from .lora_controller import enable_lora


def attn_forward(
    attn: Attention,
    hidden_states: torch.FloatTensor,
    encoder_hidden_states: torch.FloatTensor = None,
    condition_latents: torch.FloatTensor = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    image_rotary_emb: Optional[torch.Tensor] = None,
    cond_rotary_emb: Optional[torch.Tensor] = None,
    model_config: Optional[Dict[str, Any]] = {},
) -> torch.FloatTensor:
    batch_size, _, _ = (
        hidden_states.shape
        if encoder_hidden_states is None
        else encoder_hidden_states.shape
    )

    with enable_lora(
        (attn.to_q, attn.to_k, attn.to_v), model_config.get("latent_lora", False)
    ):
        # `sample` projections.
        query = attn.to_q(hidden_states)
        key = attn.to_k(hidden_states)
        value = attn.to_v(hidden_states)
        # print(query.shape, key.shape, value.shape)  # torch.Size([2, 1024, 3072]) x3

    inner_dim = key.shape[-1]
    head_dim = inner_dim // attn.heads

    query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
    key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
    value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

    if attn.norm_q is not None:
        query = attn.norm_q(query)
    if attn.norm_k is not None:
        key = attn.norm_k(key)

    # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
    if encoder_hidden_states is not None:
        # print(hidden_states.shape, encoder_hidden_states.shape, condition_latents.shape)
        # torch.Size([2, 1024, 3072]) torch.Size([2, 512, 3072]) torch.Size([2, 1024, 3072])
        # `context` projections.
        encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)

        encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
            batch_size, -1, attn.heads, head_dim
        ).transpose(1, 2)
        encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
            batch_size, -1, attn.heads, head_dim
        ).transpose(1, 2)
        encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
            batch_size, -1, attn.heads, head_dim
        ).transpose(1, 2)

        if attn.norm_added_q is not None:
            encoder_hidden_states_query_proj = attn.norm_added_q(
                encoder_hidden_states_query_proj
            )
        if attn.norm_added_k is not None:
            encoder_hidden_states_key_proj = attn.norm_added_k(
                encoder_hidden_states_key_proj
            )
        # attention
        query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
        key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
        value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)

    if image_rotary_emb is not None:
        from diffusers.models.embeddings import apply_rotary_emb

        query = apply_rotary_emb(query, image_rotary_emb)
        key = apply_rotary_emb(key, image_rotary_emb)

    if condition_latents is not None:
        cond_query = attn.to_q(condition_latents)
        cond_key = attn.to_k(condition_latents)
        cond_value = attn.to_v(condition_latents)

        cond_query = cond_query.view(batch_size, -1, attn.heads, head_dim).transpose(
            1, 2
        )
        cond_key = cond_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        cond_value = cond_value.view(batch_size, -1, attn.heads, head_dim).transpose(
            1, 2
        )
        if attn.norm_q is not None:
            cond_query = attn.norm_q(cond_query)
        if attn.norm_k is not None:
            cond_key = attn.norm_k(cond_key)

        if cond_rotary_emb is not None:
            cond_query = apply_rotary_emb(cond_query, cond_rotary_emb)
            cond_key = apply_rotary_emb(cond_key, cond_rotary_emb)

    if condition_latents is not None:
        query = torch.cat([query, cond_query], dim=2)
        key = torch.cat([key, cond_key], dim=2)
        value = torch.cat([value, cond_value], dim=2)

    if not model_config.get("union_cond_attn", True):
        # If we don't want to use the union condition attention, we need to mask the attention
        # between the hidden states and the condition latents
        attention_mask = torch.ones(
            query.shape[2], key.shape[2], device=query.device, dtype=torch.bool
        )
        condition_n = cond_query.shape[2]
        attention_mask[-condition_n:, :-condition_n] = False
        attention_mask[:-condition_n, -condition_n:] = False
    elif model_config.get("independent_condition", False):
        attention_mask = torch.ones(
            query.shape[2], key.shape[2], device=query.device, dtype=torch.bool
        )
        condition_n = cond_query.shape[2]
        attention_mask[-condition_n:, :-condition_n] = False
    if hasattr(attn, "c_factor"):
        attention_mask = torch.zeros(
            query.shape[2], key.shape[2], device=query.device, dtype=query.dtype
        )
        condition_n = cond_query.shape[2]
        bias = torch.log(attn.c_factor[0])
        attention_mask[-condition_n:, :-condition_n] = bias
        attention_mask[:-condition_n, -condition_n:] = bias
    hidden_states = F.scaled_dot_product_attention(
        query, key, value, dropout_p=0.0, is_causal=False, attn_mask=attention_mask
    )
    hidden_states = hidden_states.transpose(1, 2).reshape(
        batch_size, -1, attn.heads * head_dim
    )
    # print(f"hidden_states {hidden_states.shape}")
    hidden_states = hidden_states.to(query.dtype)

    if encoder_hidden_states is not None:
        if condition_latents is not None:
            encoder_hidden_states, hidden_states, condition_latents = (
                hidden_states[:, : encoder_hidden_states.shape[1]],
                hidden_states[
                    :, encoder_hidden_states.shape[1] : -condition_latents.shape[1]
                ],
                hidden_states[:, -condition_latents.shape[1] :],
            )
        else:
            encoder_hidden_states, hidden_states = (
                hidden_states[:, : encoder_hidden_states.shape[1]],
                hidden_states[:, encoder_hidden_states.shape[1] :],
            )

        with enable_lora((attn.to_out[0],), model_config.get("latent_lora", False)):
            # linear proj
            hidden_states = attn.to_out[0](hidden_states)
            # dropout
            hidden_states = attn.to_out[1](hidden_states)
        encoder_hidden_states = attn.to_add_out(encoder_hidden_states)

        if condition_latents is not None:
            condition_latents = attn.to_out[0](condition_latents)
            condition_latents = attn.to_out[1](condition_latents)

        return (
            (hidden_states, encoder_hidden_states, condition_latents)
            if condition_latents is not None
            else (hidden_states, encoder_hidden_states)
        )
    elif condition_latents is not None:
        # if there are condition_latents, we need to separate the hidden_states and the condition_latents
        hidden_states, condition_latents = (
            hidden_states[:, : -condition_latents.shape[1]],
            hidden_states[:, -condition_latents.shape[1] :],
        )
        # print(hidden_states.shape, condition_latents.shape)  # torch.Size([2, 1536, 3072]) torch.Size([2, 1024, 3072])
        return hidden_states, condition_latents
    else:
        return hidden_states


def block_forward(
    self,
    hidden_states: torch.FloatTensor,
    encoder_hidden_states: torch.FloatTensor,
    condition_latents: torch.FloatTensor,
    temb: torch.FloatTensor,
    cond_temb: torch.FloatTensor,
    cond_rotary_emb=None,
    image_rotary_emb=None,
    model_config: Optional[Dict[str, Any]] = {},
):
    use_cond = condition_latents is not None
    with enable_lora((self.norm1.linear,), model_config.get("latent_lora", False)):
        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
            hidden_states, emb=temb
        )

    norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = (
        self.norm1_context(encoder_hidden_states, emb=temb)
    )
    # print(norm_encoder_hidden_states.shape, c_gate_msa.shape, c_shift_mlp.shape, c_scale_mlp.shape, c_gate_mlp.shape)
    # torch.Size([2, 512, 3072]) torch.Size([2, 3072]) torch.Size([2, 3072]) torch.Size([2, 3072]) torch.Size([2, 3072])

    if use_cond:
        (
            norm_condition_latents,
            cond_gate_msa,
            cond_shift_mlp,
            cond_scale_mlp,
            cond_gate_mlp,
        ) = self.norm1(condition_latents, emb=cond_temb)

    # Attention.
    result = attn_forward(
        self.attn,
        model_config=model_config,
        hidden_states=norm_hidden_states,
        encoder_hidden_states=norm_encoder_hidden_states,
        condition_latents=norm_condition_latents if use_cond else None,
        image_rotary_emb=image_rotary_emb,
        cond_rotary_emb=cond_rotary_emb if use_cond else None,
    )
    attn_output, context_attn_output = result[:2]
    cond_attn_output = result[2] if use_cond else None

    # Process attention outputs for the `hidden_states`.
    # 1. hidden_states
    attn_output = gate_msa.unsqueeze(1) * attn_output
    # print(hidden_states.shape, attn_output.shape)  # torch.Size([2, 1024, 3072]) torch.Size([2, 1024, 3072])
    hidden_states = hidden_states + attn_output
    # 2. encoder_hidden_states
    context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
    encoder_hidden_states = encoder_hidden_states + context_attn_output
    # 3. condition_latents
    if use_cond:
        cond_attn_output = cond_gate_msa.unsqueeze(1) * cond_attn_output
        condition_latents = condition_latents + cond_attn_output
        if model_config.get("add_cond_attn", False):
            hidden_states += cond_attn_output

    # LayerNorm + MLP.
    # 1. hidden_states
    norm_hidden_states = self.norm2(hidden_states)
    norm_hidden_states = (
        norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
    )
    # 2. encoder_hidden_states
    norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
    norm_encoder_hidden_states = (
        norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
    )
    # 3. condition_latents
    if use_cond:
        norm_condition_latents = self.norm2(condition_latents)
        norm_condition_latents = (
            norm_condition_latents * (1 + cond_scale_mlp[:, None])
            + cond_shift_mlp[:, None]
        )

    # Feed-forward.
    with enable_lora((self.ff.net[2],), model_config.get("latent_lora", False)):
        # 1. hidden_states
        ff_output = self.ff(norm_hidden_states)
        ff_output = gate_mlp.unsqueeze(1) * ff_output
    # 2. encoder_hidden_states
    context_ff_output = self.ff_context(norm_encoder_hidden_states)
    context_ff_output = c_gate_mlp.unsqueeze(1) * context_ff_output
    # 3. condition_latents
    if use_cond:
        cond_ff_output = self.ff(norm_condition_latents)
        cond_ff_output = cond_gate_mlp.unsqueeze(1) * cond_ff_output

    # Process feed-forward outputs.
    hidden_states = hidden_states + ff_output
    encoder_hidden_states = encoder_hidden_states + context_ff_output
    if use_cond:
        condition_latents = condition_latents + cond_ff_output

    # Clip to avoid overflow.
    if encoder_hidden_states.dtype == torch.float16:
        encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)

    return encoder_hidden_states, hidden_states, condition_latents if use_cond else None


def single_block_forward(
    self,
    hidden_states: torch.FloatTensor,
    temb: torch.FloatTensor,
    image_rotary_emb=None,
    condition_latents: torch.FloatTensor = None,
    cond_temb: torch.FloatTensor = None,
    cond_rotary_emb=None,
    model_config: Optional[Dict[str, Any]] = {},
):

    using_cond = condition_latents is not None
    residual = hidden_states
    with enable_lora(
        (
            self.norm.linear,
            self.proj_mlp,
        ),
        model_config.get("latent_lora", False),
    ):
        norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
        mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
        if using_cond:
            residual_cond = condition_latents
            norm_condition_latents, cond_gate = self.norm(condition_latents, emb=cond_temb)
            mlp_cond_hidden_states = self.act_mlp(self.proj_mlp(norm_condition_latents))

    attn_output = attn_forward(
        self.attn,
        model_config=model_config,
        hidden_states=norm_hidden_states,
        image_rotary_emb=image_rotary_emb,
        **(
            {
                "condition_latents": norm_condition_latents,
                "cond_rotary_emb": cond_rotary_emb if using_cond else None,
            }
            if using_cond
            else {}
        ),
    )
    if using_cond:
        attn_output, cond_attn_output = attn_output

    with enable_lora((self.proj_out,), model_config.get("latent_lora", False)):
        hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
        gate = gate.unsqueeze(1)
        hidden_states = gate * self.proj_out(hidden_states)
        hidden_states = residual + hidden_states
        if using_cond:
            condition_latents = torch.cat([cond_attn_output, mlp_cond_hidden_states], dim=2)
            cond_gate = cond_gate.unsqueeze(1)
            condition_latents = cond_gate * self.proj_out(condition_latents)
            condition_latents = residual_cond + condition_latents

    if hidden_states.dtype == torch.float16:
        hidden_states = hidden_states.clip(-65504, 65504)

    return hidden_states if not using_cond else (hidden_states, condition_latents)
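The splitting logic at the end of attn_forward is easier to follow with the token layout in mind: prompt tokens, image tokens, and condition tokens are attended to as one long sequence and sliced apart again afterwards. Below is a minimal, self-contained sketch, not part of the repository; the sizes are taken from the debug comments above and are purely illustrative.

# Sketch of the [text | image | condition] sequence layout that attn_forward
# produces after attention (batch, seq, channels); the slicing mirrors the
# post-attention splitting in the code above.
import torch

text_tokens = torch.randn(2, 512, 3072)
image_tokens = torch.randn(2, 1024, 3072)
cond_tokens = torch.randn(2, 1024, 3072)

# Joint attention sees the concatenated sequence ...
joint = torch.cat([text_tokens, image_tokens, cond_tokens], dim=1)  # (2, 2560, 3072)

# ... and the three streams are recovered by slicing on the sequence dimension.
text_out = joint[:, : text_tokens.shape[1]]
image_out = joint[:, text_tokens.shape[1] : -cond_tokens.shape[1]]
cond_out = joint[:, -cond_tokens.shape[1] :]
assert text_out.shape == text_tokens.shape
assert image_out.shape == image_tokens.shape
assert cond_out.shape == cond_tokens.shape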
src/flux/condition.py
ADDED
@@ -0,0 +1,129 @@
import torch
from typing import Optional, Union, List, Tuple
from diffusers.pipelines import FluxPipeline
from PIL import Image, ImageFilter
import numpy as np
import cv2

from .pipeline_tools import encode_images

condition_dict = {
    "depth": 0,
    "canny": 1,
    "subject": 4,
    "coloring": 6,
    "deblurring": 7,
    "depth_pred": 8,
    "fill": 9,
    "sr": 10,
    "cartoon": 11,
}


class Condition(object):
    def __init__(
        self,
        condition_type: str,
        raw_img: Union[Image.Image, torch.Tensor] = None,
        condition: Union[Image.Image, torch.Tensor] = None,
        mask=None,
        position_delta=None,
        position_scale=1.0,
    ) -> None:
        self.condition_type = condition_type
        assert raw_img is not None or condition is not None
        if raw_img is not None:
            self.condition = self.get_condition(condition_type, raw_img)
        else:
            self.condition = condition
        self.position_delta = position_delta
        self.position_scale = position_scale
        # TODO: Add mask support
        assert mask is None, "Mask not supported yet"

    def get_condition(
        self, condition_type: str, raw_img: Union[Image.Image, torch.Tensor]
    ) -> Union[Image.Image, torch.Tensor]:
        """
        Returns the condition image.
        """
        if condition_type == "depth":
            from transformers import pipeline

            depth_pipe = pipeline(
                task="depth-estimation",
                model="LiheYoung/depth-anything-small-hf",
                device="cuda",
            )
            source_image = raw_img.convert("RGB")
            condition_img = depth_pipe(source_image)["depth"].convert("RGB")
            return condition_img
        elif condition_type == "canny":
            img = np.array(raw_img)
            edges = cv2.Canny(img, 100, 200)
            edges = Image.fromarray(edges).convert("RGB")
            return edges
        elif condition_type == "subject":
            return raw_img
        elif condition_type == "coloring":
            return raw_img.convert("L").convert("RGB")
        elif condition_type == "deblurring":
            condition_image = (
                raw_img.convert("RGB")
                .filter(ImageFilter.GaussianBlur(10))
                .convert("RGB")
            )
            return condition_image
        elif condition_type == "fill":
            return raw_img.convert("RGB")
        elif condition_type == "cartoon":
            return raw_img.convert("RGB")
        return self.condition

    @property
    def type_id(self) -> int:
        """
        Returns the type id of the condition.
        """
        return condition_dict[self.condition_type]

    @classmethod
    def get_type_id(cls, condition_type: str) -> int:
        """
        Returns the type id of the condition.
        """
        return condition_dict[condition_type]

    def encode(self, pipe: FluxPipeline) -> Tuple[torch.Tensor, torch.Tensor, int]:
        """
        Encodes the condition into tokens, ids and type_id.
        """
        if self.condition_type in [
            "depth",
            "canny",
            "subject",
            "coloring",
            "deblurring",
            "depth_pred",
            "fill",
            "sr",
            "cartoon",
        ]:
            tokens, ids = encode_images(pipe, self.condition)
        else:
            raise NotImplementedError(
                f"Condition type {self.condition_type} not implemented"
            )
        if self.position_delta is None and self.condition_type == "subject":
            self.position_delta = [0, -self.condition.size[0] // 16]
        if self.position_delta is not None:
            ids[:, 1] += self.position_delta[0]
            ids[:, 2] += self.position_delta[1]
        if self.position_scale != 1.0:
            scale_bias = (self.position_scale - 1.0) / 2
            ids[:, 1] *= self.position_scale
            ids[:, 2] *= self.position_scale
            ids[:, 1] += scale_bias
            ids[:, 2] += scale_bias
        type_id = torch.ones_like(ids[:, :1]) * self.type_id
        return tokens, ids, type_id
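A hedged usage sketch for the Condition class above (not from the repository): build a condition from a raw PIL image, then later pass it to Condition.encode with a loaded FluxPipeline to obtain tokens and position ids. The blank 512x512 image is only a stand-in for a real photo.

# Sketch: constructing a "canny" Condition; cv2.Canny runs inside get_condition
# and the resulting edge map is stored as an RGB PIL image in `condition.condition`.
from PIL import Image
import numpy as np
from src.flux.condition import Condition

raw = Image.fromarray(np.zeros((512, 512, 3), dtype=np.uint8))  # stand-in image
cond = Condition(condition_type="canny", raw_img=raw, position_delta=[0, 0])
print(cond.type_id)         # 1, taken from condition_dict
print(cond.condition.size)  # (512, 512) edge map
# tokens, ids, type_id = cond.encode(pipe)  # requires a loaded FluxPipeline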
src/flux/generate.py
ADDED
@@ -0,0 +1,316 @@
import torch
import yaml, os
from diffusers.pipelines import FluxPipeline, StableDiffusionPipeline
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
    rescale_noise_cfg,
)
from diffusers.utils import deprecate, is_torch_xla_available
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
    StableDiffusionPipelineOutput,
)
from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
from torchvision import transforms as T
from typing import List, Union, Optional, Dict, Any, Callable
from .transformer import tranformer_forward
from .condition import Condition

from diffusers.pipelines.flux.pipeline_flux import (
    FluxPipelineOutput,
    calculate_shift,
    retrieve_timesteps,
    np,
)


def get_config(config_path: str = None):
    config_path = config_path or os.environ.get("XFL_CONFIG")
    if not config_path:
        return {}
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    return config


def prepare_params(
    prompt: Union[str, List[str]] = None,
    prompt_2: Optional[Union[str, List[str]]] = None,
    height: Optional[int] = 512,
    width: Optional[int] = 512,
    num_inference_steps: int = 28,
    timesteps: List[int] = None,
    guidance_scale: float = 3.5,
    num_images_per_prompt: Optional[int] = 1,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    joint_attention_kwargs: Optional[Dict[str, Any]] = None,
    callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
    callback_on_step_end_tensor_inputs: List[str] = ["latents"],
    max_sequence_length: int = 512,
    **kwargs: dict,
):
    return (
        prompt,
        prompt_2,
        height,
        width,
        num_inference_steps,
        timesteps,
        guidance_scale,
        num_images_per_prompt,
        generator,
        latents,
        prompt_embeds,
        pooled_prompt_embeds,
        output_type,
        return_dict,
        joint_attention_kwargs,
        callback_on_step_end,
        callback_on_step_end_tensor_inputs,
        max_sequence_length,
    )


def seed_everything(seed: int = 42):
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(seed)
    np.random.seed(seed)


@torch.no_grad()
def generate(
    pipeline: FluxPipeline,
    mogle=None,
    conditions: List[Condition] = None,
    config_path: str = None,
    model_config: Optional[Dict[str, Any]] = {},
    condition_scale: float = 1.0,
    default_lora: bool = False,
    **params: dict,
):
    model_config = model_config or get_config(config_path).get("model", {})
    if condition_scale != 1:
        for name, module in pipeline.transformer.named_modules():
            if not name.endswith(".attn"):
                continue
            module.c_factor = torch.ones(1, 1) * condition_scale

    self = pipeline
    (
        prompt,
        prompt_2,
        height,
        width,
        num_inference_steps,
        timesteps,
        guidance_scale,
        num_images_per_prompt,
        generator,
        latents,
        prompt_embeds,
        pooled_prompt_embeds,
        output_type,
        return_dict,
        joint_attention_kwargs,
        callback_on_step_end,
        callback_on_step_end_tensor_inputs,
        max_sequence_length,
    ) = prepare_params(**params)

    height = height or self.default_sample_size * self.vae_scale_factor
    width = width or self.default_sample_size * self.vae_scale_factor

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt,
        prompt_2,
        height,
        width,
        prompt_embeds=prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
        max_sequence_length=max_sequence_length,
    )

    self._guidance_scale = guidance_scale
    self._joint_attention_kwargs = joint_attention_kwargs
    self._interrupt = False

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device

    lora_scale = (
        self.joint_attention_kwargs.get("scale", None)
        if self.joint_attention_kwargs is not None
        else None
    )
    (
        prompt_embeds,
        pooled_prompt_embeds,
        text_ids,
    ) = self.encode_prompt(
        prompt=prompt,
        prompt_2=prompt_2,
        prompt_embeds=prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        device=device,
        num_images_per_prompt=num_images_per_prompt,
        max_sequence_length=max_sequence_length,
        lora_scale=lora_scale,
    )

    # 4. Prepare latent variables
    num_channels_latents = self.transformer.config.in_channels // 4
    latents, latent_image_ids = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        latents,
    )

    # 4.1. Prepare conditions
    condition_latents, condition_ids, condition_type_ids = ([] for _ in range(3))
    use_condition = conditions is not None or []
    if use_condition:
        assert len(conditions) <= 1, "Only one condition is supported for now."
        if not default_lora:
            pipeline.set_adapters(conditions[0].condition_type)
        for condition in conditions:
            tokens, ids, type_id = condition.encode(self)
            # print(tokens.shape)  # 20 1024 64
            # bs, mask_num, channel, h, w = tokens.shape
            tokens_reshape = tokens.reshape(1, -1, *tokens.shape[1:])
            # print(tokens_reshape.shape)  # 1 20 1024 64
            condition_latents.append(tokens_reshape)  # [batch_size, token_n, token_dim]
            condition_ids.append(ids)  # [token_n, id_dim(3)]
            condition_type_ids.append(type_id)  # [token_n, 1]
        condition_latents = torch.cat(condition_latents, dim=1)
        condition_ids = torch.cat(condition_ids, dim=0)
        condition_type_ids = torch.cat(condition_type_ids, dim=0)

    # 5. Prepare timesteps
    sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
    image_seq_len = latents.shape[1]
    mu = calculate_shift(
        image_seq_len,
        self.scheduler.config.base_image_seq_len,
        self.scheduler.config.max_image_seq_len,
        self.scheduler.config.base_shift,
        self.scheduler.config.max_shift,
    )
    timesteps, num_inference_steps = retrieve_timesteps(
        self.scheduler,
        num_inference_steps,
        device,
        timesteps,
        sigmas,
        mu=mu,
    )
    num_warmup_steps = max(
        len(timesteps) - num_inference_steps * self.scheduler.order, 0
    )
    self._num_timesteps = len(timesteps)

    # 6. Denoising loop
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            if self.interrupt:
                continue
            cur_condition_latents = mogle(condition_latents, latents, t.expand(1))
            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
            timestep = t.expand(latents.shape[0]).to(latents.dtype)

            # handle guidance
            if self.transformer.config.guidance_embeds:
                guidance = torch.tensor([guidance_scale], device=device)
                guidance = guidance.expand(latents.shape[0])
            else:
                guidance = None
            noise_pred = tranformer_forward(
                self.transformer,
                model_config=model_config,
                # Inputs of the condition (new feature)
                condition_latents=cur_condition_latents if use_condition else None,
                condition_ids=condition_ids if use_condition else None,
                condition_type_ids=condition_type_ids if use_condition else None,
                # Inputs to the original transformer
                hidden_states=latents,
                # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing)
                timestep=timestep / 1000,
                guidance=guidance,
                pooled_projections=pooled_prompt_embeds,
                encoder_hidden_states=prompt_embeds,
                txt_ids=text_ids,
                img_ids=latent_image_ids,
                joint_attention_kwargs=self.joint_attention_kwargs,
                return_dict=False,
            )[0]

            # compute the previous noisy sample x_t -> x_t-1
            latents_dtype = latents.dtype
            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]

            if latents.dtype != latents_dtype:
                if torch.backends.mps.is_available():
                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
                    latents = latents.to(latents_dtype)

            if callback_on_step_end is not None:
                callback_kwargs = {}
                for k in callback_on_step_end_tensor_inputs:
                    callback_kwargs[k] = locals()[k]
                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                latents = callback_outputs.pop("latents", latents)
                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)

            # call the callback, if provided
            if i == len(timesteps) - 1 or (
                (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
            ):
                progress_bar.update()

    if output_type == "latent":
        image = latents

    else:
        latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
        latents = (
            latents / self.vae.config.scaling_factor
        ) + self.vae.config.shift_factor
        image = self.vae.decode(latents, return_dict=False)[0]
        image = self.image_processor.postprocess(image, output_type=output_type)

    # Offload all models
    self.maybe_free_model_hooks()

    if condition_scale != 1:
        for name, module in pipeline.transformer.named_modules():
            if not name.endswith(".attn"):
                continue
            del module.c_factor

    if not return_dict:
        return (image,)

    return FluxPipelineOutput(images=image)
src/flux/lora_controller.py
ADDED
@@ -0,0 +1,75 @@
from peft.tuners.tuners_utils import BaseTunerLayer
from typing import List, Any, Optional, Type


class enable_lora:
    def __init__(self, lora_modules: List[BaseTunerLayer], activated: bool) -> None:
        self.activated: bool = activated
        if activated:
            return
        self.lora_modules: List[BaseTunerLayer] = [
            each for each in lora_modules if isinstance(each, BaseTunerLayer)
        ]
        self.scales = [
            {
                active_adapter: lora_module.scaling[active_adapter]
                for active_adapter in lora_module.active_adapters
            }
            for lora_module in self.lora_modules
        ]

    def __enter__(self) -> None:
        if self.activated:
            return

        for lora_module in self.lora_modules:
            if not isinstance(lora_module, BaseTunerLayer):
                continue
            lora_module.scale_layer(0)

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[Any],
    ) -> None:
        if self.activated:
            return
        for i, lora_module in enumerate(self.lora_modules):
            if not isinstance(lora_module, BaseTunerLayer):
                continue
            for active_adapter in lora_module.active_adapters:
                lora_module.scaling[active_adapter] = self.scales[i][active_adapter]


class set_lora_scale:
    def __init__(self, lora_modules: List[BaseTunerLayer], scale: float) -> None:
        self.lora_modules: List[BaseTunerLayer] = [
            each for each in lora_modules if isinstance(each, BaseTunerLayer)
        ]
        self.scales = [
            {
                active_adapter: lora_module.scaling[active_adapter]
                for active_adapter in lora_module.active_adapters
            }
            for lora_module in self.lora_modules
        ]
        self.scale = scale

    def __enter__(self) -> None:
        for lora_module in self.lora_modules:
            if not isinstance(lora_module, BaseTunerLayer):
                continue
            lora_module.scale_layer(self.scale)

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[Any],
    ) -> None:
        for i, lora_module in enumerate(self.lora_modules):
            if not isinstance(lora_module, BaseTunerLayer):
                continue
            for active_adapter in lora_module.active_adapters:
                lora_module.scaling[active_adapter] = self.scales[i][active_adapter]
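A small illustration of the two context managers (a sketch, assuming a recent peft release in which get_peft_model accepts a plain nn.Module and BaseTunerLayer exposes scaling and scale_layer): enable_lora with activated=False temporarily zeroes the LoRA contribution of the given layers and restores the saved scaling on exit, while set_lora_scale applies an arbitrary scale in the same way.

# Sketch: wrap a LoRA-injected Linear layer and toggle its LoRA scaling.
import torch.nn as nn
from peft import LoraConfig, get_peft_model
from peft.tuners.tuners_utils import BaseTunerLayer
from src.flux.lora_controller import enable_lora, set_lora_scale

base = nn.Sequential(nn.Linear(16, 16))
peft_model = get_peft_model(base, LoraConfig(r=4, lora_alpha=4, target_modules=["0"]))

lora_layers = [m for m in peft_model.modules() if isinstance(m, BaseTunerLayer)]
with enable_lora(tuple(lora_layers), activated=False):
    pass  # inside: LoRA scaling set to 0, so only the base weights act
with set_lora_scale(lora_layers, scale=0.5):
    pass  # inside: LoRA contribution halved; original scaling restored on exit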
src/flux/pipeline_tools.py
ADDED
@@ -0,0 +1,52 @@
from diffusers.pipelines import FluxPipeline
from diffusers.utils import logging
from diffusers.pipelines.flux.pipeline_flux import logger
from torch import Tensor


def encode_images(pipeline: FluxPipeline, images: Tensor):
    images = pipeline.image_processor.preprocess(images)
    images = images.to(pipeline.device).to(pipeline.dtype)
    images = pipeline.vae.encode(images).latent_dist.sample()
    images = (
        images - pipeline.vae.config.shift_factor
    ) * pipeline.vae.config.scaling_factor
    images_tokens = pipeline._pack_latents(images, *images.shape)
    images_ids = pipeline._prepare_latent_image_ids(
        images.shape[0],
        images.shape[2],
        images.shape[3],
        pipeline.device,
        pipeline.dtype,
    )
    if images_tokens.shape[1] != images_ids.shape[0]:
        images_ids = pipeline._prepare_latent_image_ids(
            images.shape[0],
            images.shape[2] // 2,
            images.shape[3] // 2,
            pipeline.device,
            pipeline.dtype,
        )
    return images_tokens, images_ids


def prepare_text_input(pipeline: FluxPipeline, prompts, max_sequence_length=512):
    # Turn off warnings (CLIP overflow)
    logger.setLevel(logging.ERROR)
    (
        prompt_embeds,
        pooled_prompt_embeds,
        text_ids,
    ) = pipeline.encode_prompt(
        prompt=prompts,
        prompt_2=None,
        prompt_embeds=None,
        pooled_prompt_embeds=None,
        device=pipeline.device,
        num_images_per_prompt=1,
        max_sequence_length=max_sequence_length,
        lora_scale=None,
    )
    # Turn on warnings
    logger.setLevel(logging.WARNING)
    return prompt_embeds, pooled_prompt_embeds, text_ids
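encode_images is where a condition image becomes the (tokens, ids) pair used elsewhere. A back-of-the-envelope check (a sketch, assuming the standard FLUX VAE configuration of 8x spatial downsampling, 16 latent channels, and 2x2 patch packing) shows why a 512x512 input turns into the (1024, 64) token grid seen in the debug comments.

# Sketch: token count and token dimension for a 512x512 condition image.
image_size = 512
vae_downsample = 8       # assumed FLUX VAE downsampling factor
latent_channels = 16     # assumed FLUX VAE latent channels
patch = 2                # _pack_latents groups 2x2 latent patches into one token

latent_hw = image_size // vae_downsample        # 64 x 64 latent grid
tokens = (latent_hw // patch) ** 2              # 32 * 32 = 1024 packed tokens
token_dim = latent_channels * patch * patch     # 16 * 4  = 64 channels per token
print(tokens, token_dim)                        # 1024 64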
src/flux/transformer.py
ADDED
@@ -0,0 +1,257 @@
import torch
from diffusers.pipelines import FluxPipeline
from typing import List, Union, Optional, Dict, Any, Callable
from .block import block_forward, single_block_forward
from .lora_controller import enable_lora
from accelerate.utils import is_torch_version
from diffusers.models.transformers.transformer_flux import (
    FluxTransformer2DModel,
    Transformer2DModelOutput,
    USE_PEFT_BACKEND,
    scale_lora_layers,
    unscale_lora_layers,
    logger,
)
import numpy as np


def prepare_params(
    hidden_states: torch.Tensor,
    encoder_hidden_states: torch.Tensor = None,
    pooled_projections: torch.Tensor = None,
    timestep: torch.LongTensor = None,
    img_ids: torch.Tensor = None,
    txt_ids: torch.Tensor = None,
    guidance: torch.Tensor = None,
    joint_attention_kwargs: Optional[Dict[str, Any]] = None,
    controlnet_block_samples=None,
    controlnet_single_block_samples=None,
    return_dict: bool = True,
    **kwargs: dict,
):
    return (
        hidden_states,
        encoder_hidden_states,
        pooled_projections,
        timestep,
        img_ids,
        txt_ids,
        guidance,
        joint_attention_kwargs,
        controlnet_block_samples,
        controlnet_single_block_samples,
        return_dict,
    )


def tranformer_forward(
    transformer: FluxTransformer2DModel,
    condition_latents: torch.Tensor,
    condition_ids: torch.Tensor,
    condition_type_ids: torch.Tensor,
    model_config: Optional[Dict[str, Any]] = {},
    c_t=0,
    **params: dict,
):
    self = transformer
    use_condition = condition_latents is not None

    (
        hidden_states,
        encoder_hidden_states,
        pooled_projections,
        timestep,
        img_ids,
        txt_ids,
        guidance,
        joint_attention_kwargs,
        controlnet_block_samples,
        controlnet_single_block_samples,
        return_dict,
    ) = prepare_params(**params)

    if joint_attention_kwargs is not None:
        joint_attention_kwargs = joint_attention_kwargs.copy()
        lora_scale = joint_attention_kwargs.pop("scale", 1.0)
    else:
        lora_scale = 1.0

    if USE_PEFT_BACKEND:
        # weight the lora layers by setting `lora_scale` for each PEFT layer
        scale_lora_layers(self, lora_scale)
    else:
        if (
            joint_attention_kwargs is not None
            and joint_attention_kwargs.get("scale", None) is not None
        ):
            logger.warning(
                "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
            )

    with enable_lora((self.x_embedder,), model_config.get("latent_lora", False)):
        hidden_states = self.x_embedder(hidden_states)
        # print("hidden states :", hidden_states.shape)  # torch.Size([2, 1024, 3072])
    condition_latents = self.x_embedder(condition_latents) if use_condition else None
    # print(f"condition_latents shape {condition_latents.shape}")  # torch.Size([2, 1024, 3072])

    timestep = timestep.to(hidden_states.dtype) * 1000

    if guidance is not None:
        guidance = guidance.to(hidden_states.dtype) * 1000
    else:
        guidance = None

    temb = (
        self.time_text_embed(timestep, pooled_projections)
        if guidance is None
        else self.time_text_embed(timestep, guidance, pooled_projections)
    )
    # print(f"temb shape:{temb.shape}")  # torch.Size([2, 3072])

    cond_temb = (
        self.time_text_embed(torch.ones_like(timestep) * c_t * 1000, pooled_projections)
        if guidance is None
        else self.time_text_embed(
            torch.ones_like(timestep) * c_t * 1000, guidance, pooled_projections
        )
    )
    # print("cond temb shape", cond_temb.shape)  # torch.Size([2, 3072])
    encoder_hidden_states = self.context_embedder(encoder_hidden_states)
    # print(f"encoder hidden states {encoder_hidden_states.shape}")  # torch.Size([2, 512, 3072])

    if txt_ids.ndim == 3:
        logger.warning(
            "Passing `txt_ids` 3d torch.Tensor is deprecated."
            "Please remove the batch dimension and pass it as a 2d torch Tensor"
        )
        txt_ids = txt_ids[0]
    if img_ids.ndim == 3:
        logger.warning(
            "Passing `img_ids` 3d torch.Tensor is deprecated."
            "Please remove the batch dimension and pass it as a 2d torch Tensor"
        )
        img_ids = img_ids[0]

    ids = torch.cat((txt_ids, img_ids), dim=0)  # 1536 3
    image_rotary_emb = self.pos_embed(ids)  # 2 1536 128

    if use_condition:
        # condition_ids[:, :1] = condition_type_ids
        cond_rotary_emb = self.pos_embed(condition_ids)  # 2 1536 128
    # hidden_states = torch.cat([hidden_states, condition_latents], dim=1)

    for index_block, block in enumerate(self.transformer_blocks):
        if self.training and self.gradient_checkpointing:
            ckpt_kwargs: Dict[str, Any] = (
                {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
            )
            encoder_hidden_states, hidden_states, condition_latents = (
                torch.utils.checkpoint.checkpoint(
                    block_forward,
                    self=block,
                    model_config=model_config,
                    hidden_states=hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    condition_latents=condition_latents if use_condition else None,
                    temb=temb,
                    cond_temb=cond_temb if use_condition else None,
                    cond_rotary_emb=cond_rotary_emb if use_condition else None,
                    image_rotary_emb=image_rotary_emb,
                    **ckpt_kwargs,
                )
            )

        else:
            encoder_hidden_states, hidden_states, condition_latents = block_forward(
                block,
                model_config=model_config,
                hidden_states=hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                condition_latents=condition_latents if use_condition else None,
                temb=temb,
                cond_temb=cond_temb if use_condition else None,
                cond_rotary_emb=cond_rotary_emb if use_condition else None,
                image_rotary_emb=image_rotary_emb,
            )

        # controlnet residual
        if controlnet_block_samples is not None:
            interval_control = len(self.transformer_blocks) / len(
                controlnet_block_samples
            )
            interval_control = int(np.ceil(interval_control))
            hidden_states = (
                hidden_states
                + controlnet_block_samples[index_block // interval_control]
            )
    hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)

    for index_block, block in enumerate(self.single_transformer_blocks):
        if self.training and self.gradient_checkpointing:
            ckpt_kwargs: Dict[str, Any] = (
                {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
            )
            result = torch.utils.checkpoint.checkpoint(
                single_block_forward,
                self=block,
                model_config=model_config,
                hidden_states=hidden_states,
                temb=temb,
                image_rotary_emb=image_rotary_emb,
                **(
                    {
                        "condition_latents": condition_latents,
                        "cond_temb": cond_temb,
                        "cond_rotary_emb": cond_rotary_emb,
                    }
                    if use_condition
                    else {}
                ),
                **ckpt_kwargs,
            )

        else:
            result = single_block_forward(
                block,
                model_config=model_config,
                hidden_states=hidden_states,
                temb=temb,
                image_rotary_emb=image_rotary_emb,
                **(
                    {
                        "condition_latents": condition_latents,
                        "cond_temb": cond_temb,
                        "cond_rotary_emb": cond_rotary_emb,
                    }
                    if use_condition
                    else {}
                ),
            )
        if use_condition:
            hidden_states, condition_latents = result
        else:
            hidden_states = result

        # controlnet residual
        if controlnet_single_block_samples is not None:
            interval_control = len(self.single_transformer_blocks) / len(
                controlnet_single_block_samples
            )
            interval_control = int(np.ceil(interval_control))
            hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
                hidden_states[:, encoder_hidden_states.shape[1] :, ...]
                + controlnet_single_block_samples[index_block // interval_control]
            )

    hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]

    hidden_states = self.norm_out(hidden_states, temb)
    output = self.proj_out(hidden_states)
    # print(f"output shape:{output.shape}")
    if USE_PEFT_BACKEND:
        # remove `lora_scale` from each PEFT layer
        unscale_lora_layers(self, lora_scale)

    if not return_dict:
        return (output,)
    return Transformer2DModelOutput(sample=output)
src/moe/__pycache__/mogle.cpython-311.pyc
ADDED
Binary file (7.47 kB). View file
src/moe/mogle.py
ADDED
@@ -0,0 +1,140 @@
import torch
import torch.nn as nn
from diffusers.models.embeddings import Timesteps, TimestepEmbedding
import torch.optim as optim
from torch.nn import functional as F


# Define the Expert Network
class Expert(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, use_softmax=False):
        super(Expert, self).__init__()

        self.use_softmax = use_softmax

        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        return (
            self.net(x) if not self.use_softmax else torch.softmax(self.net(x), dim=1)
        )


class DynamicGatingNetwork(nn.Module):
    def __init__(self, hidden_dim=64, embed_dim=64, dtype=torch.bfloat16):
        super().__init__()

        # Embed the diffusion timestep
        self.time_proj = Timesteps(
            hidden_dim, flip_sin_to_cos=True, downscale_freq_shift=0
        )
        self.timestep_embedding = TimestepEmbedding(hidden_dim, embed_dim)
        self.timestep_embedding = self.timestep_embedding.to(dtype=torch.bfloat16)
        # Project the noise latent
        self.noise_proj = nn.Linear(hidden_dim, hidden_dim)
        self.dtype = dtype

        # Compute the per-token expert weights
        self.gate = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 20),  # one weight per expert
        )

    def forward(self, condition_latents, noise_latent, timestep):
        """
        global_latents: (bs, 1024, 64)
        noise_latent: (bs, 1024, 64)
        timestep: (bs,)
        """
        bs, seq_len, hidden_dim = condition_latents.shape

        # Embed the timestep
        time_emb = self.time_proj(timestep)  # (bs, hidden_dim)
        time_emb = time_emb.to(self.dtype)
        time_emb = self.timestep_embedding(time_emb)  # (bs, embed_dim)

        time_emb = time_emb.unsqueeze(1).expand(
            -1, seq_len, -1
        )  # (bs, 1024, embed_dim)

        # Project the noise latent
        noise_emb = self.noise_proj(noise_latent)  # (bs, 1024, 64)
        # Fuse all inputs
        # fused_input = torch.cat([condition_latents, noise_emb, time_emb], dim=2)  # (bs, 1024, 64+64+128)
        fused_input = condition_latents + noise_emb + time_emb
        # Compute the weights
        weight = self.gate(fused_input)  # (bs, 1024, 20)
        weight = F.softmax(weight, dim=2)  # normalize over the experts

        return weight


class MoGLE(nn.Module):
    def __init__(
        self,
        num_experts=20,
        input_dim=64,
        hidden_dim=32,
        output_dim=64,
        has_expert=True,
        has_gating=True,
        weight_is_scale=False,
    ):
        super().__init__()
        expert_model = None
        if has_expert:
            expert_model = Expert
        else:
            expert_model = nn.Identity
        self.global_expert = expert_model(
            input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim
        )
        self.local_experts = nn.ModuleList(
            [
                expert_model(
                    input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim
                )
                for _ in range(num_experts - 1)
            ]
        )
        # self.gating = Gating(input_dim=input_dim, num_experts=num_experts)
        if has_gating:
            self.gating = DynamicGatingNetwork()
        else:
            self.gating = nn.Identity()

        self.weight_is_scale = weight_is_scale

    def forward(self, x: torch.Tensor, noise_latent, timestep):
        global_mask = x[:, 0]  # bs 1024 64
        local_mask = x[:, 1:]  # bs 19 1024 64
        if not isinstance(self.gating, nn.Identity):
            weights = self.gating.forward(
                global_mask, noise_latent=noise_latent, timestep=timestep
            )  # bs 1024 20

        _, num_local, h, w = local_mask.shape
        global_output = self.global_expert(global_mask).unsqueeze(1)
        local_outputs = torch.stack(
            [self.local_experts[i](local_mask[:, i]) for i in range(num_local)], dim=1
        )  # (bs, 19, 1024, 64)
        global_local_outputs = torch.cat(
            [global_output, local_outputs], dim=1
        )  # bs 20 1024 64

        if isinstance(self.gating, nn.Identity):
            global_local_outputs = global_local_outputs.sum(dim=1)
            return global_local_outputs
        if self.weight_is_scale:
            weights = torch.mean(weights, dim=1, keepdim=True)  # bs 1 20
            # print("gating scale")

        weights_expanded = weights.unsqueeze(-1)
        output = (global_local_outputs.permute(0, 2, 1, 3) * weights_expanded).sum(
            dim=2
        )
        return output  # bs 1024 64
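A quick shape check for MoGLE (a sketch, not from the repository): feed it random tensors shaped like the packed latents used elsewhere, namely one global mask map plus 19 per-region maps of 1024 tokens with dimension 64, together with a one-element timestep, and confirm the fused output matches the noisy-latent token shape.

# Sketch: exercise MoGLE with dummy inputs and verify the output shape.
import torch
from src.moe.mogle import MoGLE

mogle = MoGLE().to(torch.bfloat16)
cond = torch.randn(1, 20, 1024, 64, dtype=torch.bfloat16)   # [global | 19 local] expert inputs
noise = torch.randn(1, 1024, 64, dtype=torch.bfloat16)      # current noisy latent tokens
t = torch.tensor([981.0])                                    # scheduler timestep, as in generate()

out = mogle(cond, noise, t)
print(out.shape)  # torch.Size([1, 1024, 64])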
src/train/callbacks.py
ADDED
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import lightning as L
|
2 |
+
from PIL import Image, ImageFilter, ImageDraw
|
3 |
+
import numpy as np
|
4 |
+
from transformers import pipeline
|
5 |
+
import cv2
|
6 |
+
import torch
|
7 |
+
import os
|
8 |
+
from torchvision import transforms as T
|
9 |
+
try:
|
10 |
+
import wandb
|
11 |
+
except ImportError:
|
12 |
+
wandb = None
|
13 |
+
|
14 |
+
from ..flux.condition import Condition
from ..flux.generate import generate


class FaceMoGLECallback(L.Callback):

    def __init__(self, run_name, training_config: dict = {}):
        self.run_name, self.training_config = run_name, training_config

        self.print_every_n_steps = training_config.get("print_every_n_steps", 10)
        self.save_interval = training_config.get("save_interval", 1000)
        self.sample_interval = training_config.get("sample_interval", 1000)
        self.save_path = training_config.get("save_path", "./runs")

        self.wandb_config = training_config.get("wandb", None)
        self.use_wandb = (
            wandb is not None and os.environ.get("WANDB_API_KEY") is not None
        )

        self.total_steps = 0

    def to_tensor(self, x):
        return T.ToTensor()(x)

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        # Track the mean and maximum L2 norm of the gradients of all trainable parameters
        gradient_size = 0
        max_gradient_size = 0
        count = 0
        for _, param in pl_module.named_parameters():
            if param.grad is not None:
                gradient_size += param.grad.norm(2).item()
                max_gradient_size = max(max_gradient_size, param.grad.norm(2).item())
                count += 1
        if count > 0:
            gradient_size /= count

        self.total_steps += 1

        # Log training progress to Weights & Biases when it is configured
        if self.use_wandb:
            report_dict = {
                "steps": self.total_steps,
                "epoch": trainer.current_epoch,
                "gradient_size": gradient_size,
            }
            loss_value = outputs["loss"].item() * trainer.accumulate_grad_batches
            report_dict["loss"] = loss_value
            report_dict["t"] = pl_module.last_t
            wandb.log(report_dict)

        # Print training progress every n steps
        if self.total_steps % self.print_every_n_steps == 0:
            print(
                f"Epoch: {trainer.current_epoch}, Steps: {self.total_steps}, Batch: {batch_idx}, Loss: {pl_module.log_loss:.4f}, Gradient size: {gradient_size:.4f}, Max gradient size: {max_gradient_size:.4f}"
            )

        # Save LoRA (and MoGLE) weights at specified intervals
        if self.total_steps % self.save_interval == 0:
            print(
                f"Epoch: {trainer.current_epoch}, Steps: {self.total_steps} - Saving LoRA weights"
            )
            pl_module.save_lora(
                f"{self.save_path}/{self.run_name}/ckpt/{self.total_steps}"
            )
            if hasattr(pl_module, "save_moe"):
                pl_module.save_moe(
                    f"{self.save_path}/{self.run_name}/ckpt/{self.total_steps}/moe.pt"
                )

        # Generate and save a sample image at specified intervals
        if self.total_steps % self.sample_interval == 0:
            print(
                f"Epoch: {trainer.current_epoch}, Steps: {self.total_steps} - Generating a sample"
            )
            self.generate_a_sample(
                trainer,
                pl_module,
                f"{self.save_path}/{self.run_name}/output",
                f"lora_{self.total_steps}",
                batch["condition_type"][0],  # Use the condition type from the current batch
            )

    @torch.no_grad()
    def generate_a_sample(
        self,
        trainer,
        pl_module,
        save_path,
        file_name,
        condition_type="super_resolution",
    ):
        # TODO: turn these two variables into parameters
        target_size = trainer.training_config["dataset"]["target_size"]
        position_scale = trainer.training_config["dataset"].get("position_scale", 1.0)

        generator = torch.Generator(device=pl_module.device)
        generator.manual_seed(42)

        test_list = []

        condition_img_path = "data/mmcelebahq/mask/27000.png"
        test_list.append(
            (
                condition_img_path,
                [0, 0],
                "She is wearing lipstick. She is attractive and has straight hair.",
                {"position_scale": position_scale} if position_scale != 1.0 else {},
            )
        )

        if not os.path.exists(save_path):
            os.makedirs(save_path)
        for i, (condition_img_path, position_delta, prompt, *others) in enumerate(
            test_list
        ):
            # Build the condition tensor: the full segmentation map plus 19 binary class masks
            global_mask = Image.open(condition_img_path).convert("RGB")
            mask_list = [self.to_tensor(global_mask)]
            mask = Image.open(condition_img_path)
            mask = np.array(mask)
            for cls_id in range(19):
                local_mask = np.zeros_like(mask)
                local_mask[mask == cls_id] = 255

                local_mask_rgb = Image.fromarray(local_mask).convert("RGB")
                local_mask_tensor = self.to_tensor(local_mask_rgb)
                mask_list.append(local_mask_tensor)
            condition_img = torch.stack(mask_list, dim=0)  # [20, 3, H, W]

            condition = Condition(
                condition_type=condition_type,
                condition=condition_img,
                position_delta=position_delta,
                **(others[0] if others else {}),
            )

            res = generate(
                pl_module.flux_pipe,
                mogle=pl_module.mogle,
                prompt=prompt,
                conditions=[condition],
                height=target_size,
                width=target_size,
                generator=generator,
                model_config=pl_module.model_config,
                default_lora=True,
            )
            res.images[0].save(
                os.path.join(save_path, f"{file_name}_{condition_type}_{i}.jpg")
            )
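For orientation, a minimal sketch of how this callback could be attached to a Lightning Trainer follows; the config values and trainer arguments are illustrative assumptions, not the repository's actual training entry point. Note that generate_a_sample reads trainer.training_config, so that attribute has to be set on the trainer explicitly.

# Illustrative sketch only -- the real training script is not part of this upload.
import lightning as L

from src.train.callbacks import FaceMoGLECallback

training_config = {
    "print_every_n_steps": 10,        # console log frequency used by the callback
    "save_interval": 1000,            # save LoRA + MoGLE weights every N steps
    "sample_interval": 1000,          # render a mask-to-face sample every N steps
    "save_path": "./runs",
    "dataset": {"target_size": 512},  # read by generate_a_sample
}

callback = FaceMoGLECallback(run_name="face-mogle-demo", training_config=training_config)
trainer = L.Trainer(max_steps=20000, callbacks=[callback])
trainer.training_config = training_config  # generate_a_sample expects this attribute
# trainer.fit(model, train_dataloaders=train_loader)  # model and loader defined elsewhere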
src/train/data.py
ADDED
@@ -0,0 +1,98 @@
from PIL import Image
import os
import numpy as np
from torch.utils.data import Dataset
import torchvision.transforms as T
import random
import torch
import json


class MMCelebAHQ(Dataset):
    def __init__(
        self,
        root="data/mmcelebahq",
        condition_size: int = 512,
        target_size: int = 512,
        condition_type: str = "depth",
        drop_text_prob: float = 0.1,
        drop_image_prob: float = 0.1,
        return_pil_image: bool = False,
        position_scale=1.0,
    ):
        self.root = root
        self.face_paths, self.mask_paths, self.prompts = self.get_face_mask_prompt()
        self.condition_size = condition_size
        self.target_size = target_size
        self.condition_type = condition_type
        self.drop_text_prob = drop_text_prob
        self.drop_image_prob = drop_image_prob
        self.return_pil_image = return_pil_image
        self.position_scale = position_scale

        self.to_tensor = T.ToTensor()

    def get_face_mask_prompt(self):
        # The first 27,000 face/mask pairs are used for training
        face_paths = [
            os.path.join(self.root, "face", f"{i}.jpg") for i in range(0, 27000)
        ]
        mask_paths = [
            os.path.join(self.root, "mask", f"{i}.png") for i in range(0, 27000)
        ]
        with open(os.path.join(self.root, "text.json"), mode="r") as f:
            prompts = json.load(f)
        return face_paths, mask_paths, prompts

    def __len__(self):
        return len(self.face_paths)

    def __getitem__(self, idx):
        image = Image.open(self.face_paths[idx]).convert("RGB")
        prompts = self.prompts[f"{idx}.jpg"]
        description = random.choices(prompts, k=1)[0].strip()
        # Note: this condition is always True, so the configured position_scale is always used
        enable_scale = random.random() < 1
        if not enable_scale:
            condition_size = int(self.condition_size * self.position_scale)
            position_scale = 1.0
        else:
            condition_size = self.condition_size
            position_scale = self.position_scale

        # Get the condition image
        position_delta = np.array([0, 0])

        # Build the condition: the full segmentation map plus 19 binary class masks
        mask = np.array(Image.open(self.mask_paths[idx]))
        mask_list = [self.to_tensor(Image.open(self.mask_paths[idx]).convert("RGB"))]
        for i in range(19):
            local_mask = np.zeros_like(mask)
            local_mask[mask == i] = 255

            # Randomly drop (zero out) each local mask with drop_image_prob
            drop_image = random.random() < self.drop_image_prob
            if drop_image:
                local_mask = np.zeros_like(mask)

            local_mask_rgb = Image.fromarray(local_mask).convert("RGB")
            local_mask_tensor = self.to_tensor(local_mask_rgb)
            mask_list.append(local_mask_tensor)
        condition_img = torch.stack(mask_list, dim=0)  # [20, 3, H, W]

        # Randomly drop the text prompt
        drop_text = random.random() < self.drop_text_prob
        if drop_text:
            description = ""

        return {
            "image": self.to_tensor(image),
            "condition": condition_img,
            "condition_type": self.condition_type,
            "description": description,
            "position_delta": position_delta,
            **({"pil_image": [image, condition_img]} if self.return_pil_image else {}),
            **({"position_scale": position_scale} if position_scale != 1.0 else {}),
        }
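A quick sanity-check sketch for the dataset above; it assumes the expected data/mmcelebahq layout (face/*.jpg, mask/*.png, text.json) is present locally and is otherwise illustrative.

# Illustrative sketch: requires data/mmcelebahq/{face,mask,text.json} on disk.
from torch.utils.data import DataLoader

from src.train.data import MMCelebAHQ

dataset = MMCelebAHQ(root="data/mmcelebahq")
sample = dataset[0]
print(sample["image"].shape)      # [3, H, W] face photo at its stored resolution
print(sample["condition"].shape)  # [20, 3, 512, 512]: full mask + 19 per-class masks
print(sample["description"])      # one randomly chosen caption ("" if text was dropped)

loader = DataLoader(dataset, batch_size=2, shuffle=True, num_workers=4)
batch = next(iter(loader))
print(batch["condition"].shape)   # [2, 20, 3, 512, 512]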
src/train/model.py
ADDED
@@ -0,0 +1,201 @@
import lightning as L
from diffusers.pipelines import FluxPipeline
import torch
from peft import LoraConfig, get_peft_model_state_dict

import prodigyopt
import os
from ..flux.transformer import tranformer_forward
from ..flux.condition import Condition
from ..flux.pipeline_tools import encode_images, prepare_text_input

from ..moe.mogle import MoGLE


class FaceMoGLE(L.LightningModule):
    def __init__(
        self,
        flux_pipe_id: str,
        lora_path: str = None,
        lora_config: dict = None,
        device: str = "cuda",
        dtype: torch.dtype = torch.bfloat16,
        model_config: dict = {},
        optimizer_config: dict = None,
        gradient_checkpointing: bool = False,
        has_expert=True,
        has_gating=True,
        weight_is_scale=False,
    ):
        # Initialize the LightningModule
        super().__init__()
        self.model_config = model_config
        self.optimizer_config = optimizer_config

        # Load the Flux pipeline
        self.flux_pipe: FluxPipeline = (
            FluxPipeline.from_pretrained(flux_pipe_id).to(dtype=dtype).to(device)
        )
        self.transformer = self.flux_pipe.transformer
        self.transformer.gradient_checkpointing = gradient_checkpointing
        self.transformer.train()
        self.mogle = MoGLE(
            has_expert=has_expert, has_gating=has_gating, weight_is_scale=weight_is_scale
        )
        self.mogle.train()

        # Freeze the Flux pipeline
        self.flux_pipe.text_encoder.requires_grad_(False).eval()
        self.flux_pipe.text_encoder_2.requires_grad_(False).eval()
        self.flux_pipe.vae.requires_grad_(False).eval()

        # Initialize LoRA layers
        self.lora_layers = self.init_lora(lora_path, lora_config)

        self.to(device).to(dtype)

    def init_lora(self, lora_path: str, lora_config: dict):
        assert lora_path or lora_config
        if lora_path:
            # TODO: Implement this
            raise NotImplementedError
        else:
            self.transformer.add_adapter(LoraConfig(**lora_config))
            # TODO: Check if this is correct (p.requires_grad)
            lora_layers = filter(
                lambda p: p.requires_grad, self.transformer.parameters()
            )
        return list(lora_layers)

    def save_lora(self, path: str):
        FluxPipeline.save_lora_weights(
            save_directory=path,
            transformer_lora_layers=get_peft_model_state_dict(self.transformer),
            safe_serialization=True,
        )
        torch.save(self.mogle.state_dict(), os.path.join(path, "mogle.pt"))

    def configure_optimizers(self):
        # Freeze the transformer
        self.transformer.requires_grad_(False)
        opt_config = self.optimizer_config

        # Set the trainable parameters (LoRA layers + MoGLE)
        self.trainable_params = self.lora_layers + [p for p in self.mogle.parameters()]

        # Unfreeze trainable parameters
        for p in self.trainable_params:
            p.requires_grad_(True)

        # Initialize the optimizer
        if opt_config["type"] == "AdamW":
            optimizer = torch.optim.AdamW(self.trainable_params, **opt_config["params"])
        elif opt_config["type"] == "Prodigy":
            optimizer = prodigyopt.Prodigy(
                self.trainable_params,
                **opt_config["params"],
            )
        elif opt_config["type"] == "SGD":
            optimizer = torch.optim.SGD(self.trainable_params, **opt_config["params"])
        else:
            raise NotImplementedError

        return optimizer

    def training_step(self, batch, batch_idx):
        step_loss = self.step(batch)
        # Exponential moving average of the loss for logging
        self.log_loss = (
            step_loss.item()
            if not hasattr(self, "log_loss")
            else self.log_loss * 0.95 + step_loss.item() * 0.05
        )
        return step_loss

    def step(self, batch):
        imgs = batch["image"]
        conditions = batch["condition"]  # [bs, 20, 3, 512, 512]
        condition_types = batch["condition_type"]
        prompts = batch["description"]
        position_delta = batch["position_delta"][0]
        position_scale = float(batch.get("position_scale", [1.0])[0])

        # Prepare inputs
        with torch.no_grad():
            # Prepare image input
            x_0, img_ids = encode_images(self.flux_pipe, imgs)

            # Prepare text input
            prompt_embeds, pooled_prompt_embeds, text_ids = prepare_text_input(
                self.flux_pipe, prompts
            )

            # Prepare t and x_t
            t = torch.sigmoid(torch.randn((imgs.shape[0],), device=self.device))
            x_1 = torch.randn_like(x_0).to(self.device)
            t_ = t.unsqueeze(1).unsqueeze(1)
            x_t = ((1 - t_) * x_0 + t_ * x_1).to(self.dtype)

            # Prepare conditions: reshape [bs, 20, 3, 512, 512] -> [bs*20, 3, 512, 512]
            # so the global mask and the 19 local masks are encoded by the VAE in one pass
            c_bs, c_classes, c_channels, c_h, c_w = conditions.shape
            conditions = conditions.view(c_bs * c_classes, c_channels, c_h, c_w)

            # condition_latents: [bs*20, 1024, 64] mask features
            # condition_ids: [1024, 3] position embedding
            condition_latents, condition_ids = encode_images(self.flux_pipe, conditions)
            condition_latents_reshape = condition_latents.reshape(
                c_bs, c_classes, *condition_latents.shape[-2:]
            )  # [bs, 20, 1024, 64]

        # Fuse the global and local mask latents with MoGLE (kept outside no_grad so it trains)
        condition_latents = self.mogle.forward(
            condition_latents_reshape, noise_latent=x_t, timestep=t
        )

        # Add position delta
        condition_ids[:, 1] += position_delta[0]
        condition_ids[:, 2] += position_delta[1]

        if position_scale != 1.0:
            scale_bias = (position_scale - 1.0) / 2
            condition_ids[:, 1] *= position_scale
            condition_ids[:, 2] *= position_scale
            condition_ids[:, 1] += scale_bias
            condition_ids[:, 2] += scale_bias

        # Prepare condition type
        condition_type_ids = torch.tensor(
            [
                Condition.get_type_id(condition_type)
                for condition_type in condition_types
            ]
        ).to(self.device)
        condition_type_ids = (
            torch.ones_like(condition_ids[:, 0]) * condition_type_ids[0]
        ).unsqueeze(1)

        # Prepare guidance
        guidance = (
            torch.ones_like(t).to(self.device)
            if self.transformer.config.guidance_embeds
            else None
        )

        # Forward pass
        transformer_out = tranformer_forward(
            self.transformer,
            # Model config
            model_config=self.model_config,
            # Inputs of the condition (new feature)
            condition_latents=condition_latents,
            condition_ids=condition_ids,
            condition_type_ids=condition_type_ids,
            # Inputs to the original transformer
            hidden_states=x_t,
            timestep=t,
            guidance=guidance,
            pooled_projections=pooled_prompt_embeds,
            encoder_hidden_states=prompt_embeds,
            txt_ids=text_ids,
            img_ids=img_ids,
            joint_attention_kwargs=None,
            return_dict=False,
        )
        pred = transformer_out[0]

        # Flow-matching loss: regress the predicted velocity onto (x_1 - x_0)
        loss = torch.nn.functional.mse_loss(pred, (x_1 - x_0), reduction="mean")
        self.last_t = t.mean().item()
        return loss
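step() above is a rectified-flow objective: with clean image latents x_0, Gaussian noise x_1, and t ~ sigmoid(N(0, 1)), it forms x_t = (1 - t)·x_0 + t·x_1 and regresses the transformer output onto the velocity x_1 - x_0, with the MoGLE-fused mask latents injected as extra condition tokens. A hedged sketch of constructing and training the module follows; the pipeline id, LoRA hyperparameters, and optimizer settings are assumptions (the repository's YAML config holds the real values).

# All hyperparameters below are assumed for illustration.
import lightning as L
from torch.utils.data import DataLoader

from src.train.data import MMCelebAHQ
from src.train.model import FaceMoGLE

model = FaceMoGLE(
    flux_pipe_id="black-forest-labs/FLUX.1-dev",   # assumed FLUX base pipeline
    lora_config={                                  # assumed LoRA hyperparameters
        "r": 4,
        "lora_alpha": 4,
        "init_lora_weights": "gaussian",
        "target_modules": ["to_k", "to_q", "to_v", "to_out.0"],
    },
    optimizer_config={"type": "Prodigy", "params": {"lr": 1.0}},
    gradient_checkpointing=True,
)

loader = DataLoader(MMCelebAHQ(root="data/mmcelebahq"), batch_size=1, shuffle=True)
trainer = L.Trainer(max_steps=20000)  # add FaceMoGLECallback as sketched earlier
trainer.fit(model, train_dataloaders=loader)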
weights/mogle.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b071a349d1e8f922d32a066014f9cc80b39f8db55043d8bdf04e79e156d4f243
size 238252
weights/pytorch_lora_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5b2202f249a33252ce4f630db2f9536a28caf4b90e27927633f1f3bbb121f774
size 29066872
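The two weight entries above are Git LFS pointer files rather than the binaries themselves: mogle.pt holds the trained MoGLE state dict and pytorch_lora_weights.safetensors the FLUX LoRA. A rough loading sketch for inference is given below (the pipeline id and dtypes are assumptions; the Space's demo app performs the real wiring), after which sampling goes through src.flux.generate.generate with a Condition built from the 20-channel mask stack, as in FaceMoGLECallback.generate_a_sample.

# Sketch under assumptions: run `git lfs pull` first so the real binaries are present.
import torch
from diffusers.pipelines import FluxPipeline

from src.moe.mogle import MoGLE

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16  # assumed base model
).to("cuda")
pipe.load_lora_weights("weights", weight_name="pytorch_lora_weights.safetensors")

mogle = MoGLE(has_expert=True, has_gating=True, weight_is_scale=False)  # defaults from FaceMoGLE
mogle.load_state_dict(torch.load("weights/mogle.pt", map_location="cpu"))
mogle = mogle.to("cuda", dtype=torch.bfloat16).eval()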