marks committed
Commit · 3778bc0 · Parent(s): 002c192
Init
Files changed:
- .gitignore +17 -0
- Dockerfile +30 -0
- LICENSE +201 -0
- api.py +122 -0
- app.py +167 -4
- configs/config-dev-1-RTX6000ADA-Copy1.json +57 -0
- configs/config-dev-1-RTX6000ADA.json +57 -0
- configs/config-dev-cuda0.json +56 -0
- configs/config-dev-eval.json +57 -0
- configs/config-dev-gigaquant.json +58 -0
- configs/config-dev-offload-1-4080.json +58 -0
- configs/config-dev-offload-1-4090.json +58 -0
- configs/config-dev-offload.json +58 -0
- configs/config-dev-prequant.json +57 -0
- configs/config-dev.json +57 -0
- configs/config-f8.json +48 -0
- configs/config-schnell-cuda0.json +57 -0
- configs/config-schnell.json +57 -0
- dark.safetensors +3 -0
- f8.json +48 -0
- float8_quantize.py +496 -0
- flux_emphasis.py +447 -0
- flux_pipeline.py +729 -0
- image_encoder.py +35 -0
- lora_loading.py +753 -0
- main.py +199 -0
- modules/autoencoder.py +336 -0
- modules/conditioner.py +128 -0
- modules/flux_model.py +734 -0
- photo.safetensors +3 -0
- start.py +0 -0
- util.py +333 -0
.gitignore
ADDED
@@ -0,0 +1,17 @@
__pycache__
*.jpg
*.png
*.jpeg
*.gif
*.bmp
*.webp
*.mp4
*.mp3
*.mp3
*.txt
.copilotignore
.misc
BFL-flux-diffusers
.env
.env.*
perfection.safetensors
Dockerfile
ADDED
@@ -0,0 +1,30 @@
# Base image with Python 3.11, PyTorch, CUDA 12.4.1, and Ubuntu 22.04
FROM runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04

# Set the working directory inside the container
WORKDIR /workspace

# Install necessary packages and dependencies
RUN apt-get update && apt-get install -y \
    wget \
    git \
    && rm -rf /var/lib/apt/lists/*

# Clone your repository
RUN git clone https://github.com/Yuanshi9815/OminiControl

# Change directory to the cloned repo
WORKDIR /workspace/fp8

# Install Python dependencies
RUN pip install -r requirements.txt

# Download the required model files
RUN wget https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/flux1-schnell.safetensors -O /workspace/flux1-schnell.safetensors && \
    wget https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/ae.safetensors -O /workspace/ae.safetensors

# Expose necessary HTTP ports
EXPOSE 8888 7860

# Set the command to run your Python script
CMD ["python", "main_gr.py"]
LICENSE
ADDED
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.

"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:

(a) You must give any other recipients of the Work or Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.

You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives.

Copyright 2024 Alex Redden

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
api.py
ADDED
@@ -0,0 +1,122 @@
from typing import Literal, Optional, TYPE_CHECKING

import numpy as np
from fastapi import FastAPI
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel, Field
from platform import system

if TYPE_CHECKING:
    from flux_pipeline import FluxPipeline

if system() == "Windows":
    MAX_RAND = 2**16 - 1
else:
    MAX_RAND = 2**32 - 1


class AppState:
    model: "FluxPipeline"


class FastAPIApp(FastAPI):
    state: AppState


class LoraArgs(BaseModel):
    scale: Optional[float] = 1.0
    path: Optional[str] = None
    name: Optional[str] = None
    action: Optional[Literal["load", "unload"]] = "load"


class LoraLoadResponse(BaseModel):
    status: Literal["success", "error"]
    message: Optional[str] = None


class GenerateArgs(BaseModel):
    prompt: str
    width: Optional[int] = Field(default=720)
    height: Optional[int] = Field(default=1024)
    num_steps: Optional[int] = Field(default=24)
    guidance: Optional[float] = Field(default=3.5)
    seed: Optional[int] = Field(
        default_factory=lambda: np.random.randint(0, MAX_RAND), gt=0, lt=MAX_RAND
    )
    strength: Optional[float] = 1.0
    init_image: Optional[str] = None


app = FastAPIApp()


@app.post("/generate")
def generate(args: GenerateArgs):
    """
    Generates an image from the Flux flow transformer.

    Args:
        args (GenerateArgs): Arguments for image generation:

        - `prompt`: The prompt used for image generation.
        - `width`: The width of the image.
        - `height`: The height of the image.
        - `num_steps`: The number of steps for the image generation.
        - `guidance`: The guidance for image generation, represents the
          influence of the prompt on the image generation.
        - `seed`: The seed for the image generation.
        - `strength`: strength for image generation, 0.0 - 1.0.
          Represents the percent of diffusion steps to run,
          setting the init_image as the noised latent at the
          given number of steps.
        - `init_image`: Base64 encoded image or path to image to use as the init image.

    Returns:
        StreamingResponse: The generated image as streaming jpeg bytes.
    """
    result = app.state.model.generate(**args.model_dump())
    return StreamingResponse(result, media_type="image/jpeg")


@app.post("/lora", response_model=LoraLoadResponse)
def lora_action(args: LoraArgs):
    """
    Loads or unloads a LoRA checkpoint into / from the Flux flow transformer.

    Args:
        args (LoraArgs): Arguments for the LoRA action:

        - `scale`: The scaling factor for the LoRA weights.
        - `path`: The path to the LoRA checkpoint.
        - `name`: The name of the LoRA checkpoint.
        - `action`: The action to perform, either "load" or "unload".

    Returns:
        LoraLoadResponse: The status of the LoRA action.
    """
    try:
        if args.action == "load":
            app.state.model.load_lora(args.path, args.scale, args.name)
        elif args.action == "unload":
            app.state.model.unload_lora(args.name if args.name else args.path)
        else:
            return JSONResponse(
                content={
                    "status": "error",
                    "message": f"Invalid action, expected 'load' or 'unload', got {args.action}",
                },
                status_code=400,
            )
    except Exception as e:
        return JSONResponse(
            status_code=500, content={"status": "error", "message": str(e)}
        )
    return JSONResponse(status_code=200, content={"status": "success"})
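A minimal client sketch for the `/generate` endpoint above; the host and port are assumptions (whatever the ASGI server, e.g. uvicorn, is bound to), not something this file pins down:

import requests

# assumes the API above is being served on localhost:8000
payload = {"prompt": "a misty forest at dawn", "width": 1024, "height": 1024, "num_steps": 24}
resp = requests.post("http://localhost:8000/generate", json=payload)
resp.raise_for_status()
with open("out.jpg", "wb") as f:
    f.write(resp.content)  # the endpoint streams back raw JPEG bytes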
app.py
CHANGED
@@ -1,7 +1,170 @@
+import os
+import subprocess
+import spaces
+import torch
+from safetensors.torch import load_file
+from flux_pipeline import FluxPipeline
 import gradio as gr
+from PIL import Image
 
-def
-
+def download_models():
+    """
+    Download required models at application startup using wget.
+    """
+    model_urls = [
+        "https://huggingface.co/black-forest-labs/FLUX.1-dev/resolve/main/flux1-dev.safetensors",
+        "https://huggingface.co/black-forest-labs/FLUX.1-dev/resolve/main/ae.safetensors",
+    ]
+    for url in model_urls:
+        filename = url.split("/")[-1]
+        if not os.path.exists(filename):
+            print(f"Downloading {filename}...")
+            subprocess.run(["wget", "-O", filename, url], check=True)
+        else:
+            print(f"{filename} already exists, skipping download.")
 
-
-
+    print("All models are ready.")
+
+
+def load_sft(ckpt_path, device="cpu"):
+    """
+    Load a safetensors file.
+    Args:
+        ckpt_path (str): Local path to the safetensors file.
+        device (str): Device to load the file onto.
+    Returns:
+        Safetensors model state dictionary.
+    """
+    if os.path.exists(ckpt_path):
+        print(f"Loading local checkpoint: {ckpt_path}")
+        return load_file(ckpt_path, device=device)
+    else:
+        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")
+
+
+def create_demo(config_path: str):
+    generator = FluxPipeline.load_pipeline_from_config_path(config_path)
+    load_sft("photo.safetensors", "cuda")
+    load_sft("dark.safetensors", "cuda")
+    load_sft("perfection.safetensors", "cuda")
+    @spaces.GPU
+    def generate_image(
+        prompt,
+        width,
+        height,
+        num_steps,
+        guidance,
+        seed,
+        init_image,
+        image2image_strength,
+        add_sampling_metadata,
+    ):
+
+        seed = int(seed)
+        if seed == -1:
+            seed = None
+        out = generator.generate(
+            prompt,
+            width,
+            height,
+            num_steps=num_steps,
+            guidance=guidance,
+            seed=seed,
+            init_image=init_image,
+            strength=image2image_strength,
+            silent=False,
+            num_images=1,
+            return_seed=True,
+        )
+        image_bytes = out[0]
+        return Image.open(image_bytes), str(out[1]), None
+
+    is_schnell = generator.config.version == "flux-schnell"
+
+    with gr.Blocks() as demo:
+        gr.Markdown(f"# Flux Image Generation Demo - Model: {generator.config.version}")
+
+        with gr.Row():
+            with gr.Column():
+                prompt = gr.Textbox(
+                    label="Prompt",
+                    value='a photo of a forest with mist swirling around the tree trunks. The word "FLUX" is painted over it in big, red brush strokes with visible texture',
+                )
+                do_img2img = gr.Checkbox(
+                    label="Image to Image", value=False, interactive=not is_schnell
+                )
+                init_image = gr.Image(label="Input Image", visible=False)
+                image2image_strength = gr.Slider(
+                    0.0, 1.0, 0.8, step=0.1, label="Noising strength", visible=False
+                )
+
+                with gr.Accordion("Advanced Options", open=False):
+                    width = gr.Slider(128, 8192, 1152, step=16, label="Width")
+                    height = gr.Slider(128, 8192, 640, step=16, label="Height")
+                    num_steps = gr.Slider(
+                        1, 50, 4 if is_schnell else 20, step=1, label="Number of steps"
+                    )
+                    guidance = gr.Slider(
+                        1.0,
+                        10.0,
+                        3.5,
+                        step=0.1,
+                        label="Guidance",
+                        interactive=not is_schnell,
+                    )
+                    seed = gr.Textbox(-1, label="Seed (-1 for random)")
+                    add_sampling_metadata = gr.Checkbox(
+                        label="Add sampling parameters to metadata?", value=True
+                    )
+
+                generate_btn = gr.Button("Generate")
+
+            with gr.Column(min_width="960px"):
+                output_image = gr.Image(label="Generated Image")
+                seed_output = gr.Number(label="Used Seed")
+                warning_text = gr.Textbox(label="Warning", visible=False)
+
+        def update_img2img(do_img2img):
+            return {
+                init_image: gr.update(visible=do_img2img),
+                image2image_strength: gr.update(visible=do_img2img),
+            }
+
+        do_img2img.change(
+            update_img2img, do_img2img, [init_image, image2image_strength]
+        )
+
+        generate_btn.click(
+            fn=generate_image,
+            inputs=[
+                prompt,
+                width,
+                height,
+                num_steps,
+                guidance,
+                seed,
+                init_image,
+                image2image_strength,
+                add_sampling_metadata,
+            ],
+            outputs=[output_image, seed_output, warning_text],
+        )
+
+    return demo
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Flux")
+    parser.add_argument(
+        "--config", type=str, default="configs/config-dev-1-RTX6000ADA.json", help="Config file path"
+    )
+    parser.add_argument(
+        "--share", action="store_true", help="Create a public link to your demo"
+    )
+
+    args = parser.parse_args()
+
+    demo = create_demo(args.config)
+    demo.launch(share=args.share)
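The demo can also be driven programmatically rather than through the `__main__` block; a sketch, assuming the repo's dependencies (including `spaces`) are installed and the working directory is the repo root:

from app import create_demo

demo = create_demo("configs/config-dev-1-RTX6000ADA.json")
demo.launch(share=False)  # equivalent to `python app.py` without --share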
configs/config-dev-1-RTX6000ADA-Copy1.json
ADDED
@@ -0,0 +1,57 @@
{
  "version": "flux-dev",
  "params": {
    "in_channels": 64,
    "vec_in_dim": 768,
    "context_in_dim": 4096,
    "hidden_size": 3072,
    "mlp_ratio": 4.0,
    "num_heads": 24,
    "depth": 19,
    "depth_single_blocks": 38,
    "axes_dim": [16, 56, 56],
    "theta": 10000,
    "qkv_bias": true,
    "guidance_embed": true
  },
  "ae_params": {
    "resolution": 256,
    "in_channels": 3,
    "ch": 128,
    "out_ch": 3,
    "ch_mult": [1, 2, 4, 4],
    "num_res_blocks": 2,
    "z_channels": 16,
    "scale_factor": 0.3611,
    "shift_factor": 0.1159
  },
  "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir/flux1-dev.sft",
  "ae_path": "/big/generator-ui/flux-testing/flux/model-dir/ae.sft",
  "repo_id": "black-forest-labs/FLUX.1-dev",
  "repo_flow": "flux1-dev.sft",
  "repo_ae": "ae.sft",
  "text_enc_max_length": 512,
  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
  "text_enc_device": "cuda:0",
  "ae_device": "cuda:0",
  "flux_device": "cuda:0",
  "flow_dtype": "float16",
  "ae_dtype": "bfloat16",
  "text_enc_dtype": "bfloat16",
  "flow_quantization_dtype": "qfloat8",
  "text_enc_quantization_dtype": "qfloat8",
  "compile_extras": true,
  "compile_blocks": true,
  "offload_text_encoder": false,
  "offload_vae": false,
  "offload_flow": false
}
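The config variants below keep the same model hyperparameters and differ only in device placement (`*_device`), quantization (`*_quantization_dtype`), compilation (`compile_*`), and CPU offload (`offload_*`) for different GPU budgets. A minimal sketch of loading one directly, modeled on the `generator.generate(...)` call in app.py (the exact return shape is an assumption from that call site):

from PIL import Image
from flux_pipeline import FluxPipeline  # module added in this commit

pipe = FluxPipeline.load_pipeline_from_config_path("configs/config-dev.json")
# with return_seed=True, app.py treats out[0] as JPEG bytes and out[1] as the seed
out = pipe.generate("a misty forest at dawn", 1152, 640,
                    num_steps=20, guidance=3.5, return_seed=True)
Image.open(out[0]).save("out.jpg")
print("seed:", out[1])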
configs/config-dev-1-RTX6000ADA.json
ADDED
@@ -0,0 +1,57 @@
{
  "version": "flux-dev",
  "params": {
    "in_channels": 64,
    "vec_in_dim": 768,
    "context_in_dim": 4096,
    "hidden_size": 3072,
    "mlp_ratio": 4.0,
    "num_heads": 24,
    "depth": 19,
    "depth_single_blocks": 38,
    "axes_dim": [16, 56, 56],
    "theta": 10000,
    "qkv_bias": true,
    "guidance_embed": true
  },
  "ae_params": {
    "resolution": 256,
    "in_channels": 3,
    "ch": 128,
    "out_ch": 3,
    "ch_mult": [1, 2, 4, 4],
    "num_res_blocks": 2,
    "z_channels": 16,
    "scale_factor": 0.3611,
    "shift_factor": 0.1159
  },
  "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir/flux1-dev.sft",
  "ae_path": "/big/generator-ui/flux-testing/flux/model-dir/ae.sft",
  "repo_id": "black-forest-labs/FLUX.1-dev",
  "repo_flow": "flux1-dev.sft",
  "repo_ae": "ae.sft",
  "text_enc_max_length": 512,
  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
  "text_enc_device": "cuda:0",
  "ae_device": "cuda:0",
  "flux_device": "cuda:0",
  "flow_dtype": "float16",
  "ae_dtype": "bfloat16",
  "text_enc_dtype": "bfloat16",
  "flow_quantization_dtype": "qfloat8",
  "text_enc_quantization_dtype": "qfloat8",
  "compile_extras": true,
  "compile_blocks": true,
  "offload_text_encoder": false,
  "offload_vae": false,
  "offload_flow": false
}
configs/config-dev-cuda0.json
ADDED
@@ -0,0 +1,56 @@
{
  "version": "flux-dev",
  "params": {
    "in_channels": 64,
    "vec_in_dim": 768,
    "context_in_dim": 4096,
    "hidden_size": 3072,
    "mlp_ratio": 4.0,
    "num_heads": 24,
    "depth": 19,
    "depth_single_blocks": 38,
    "axes_dim": [16, 56, 56],
    "theta": 10000,
    "qkv_bias": true,
    "guidance_embed": true
  },
  "ae_params": {
    "resolution": 256,
    "in_channels": 3,
    "ch": 128,
    "out_ch": 3,
    "ch_mult": [1, 2, 4, 4],
    "num_res_blocks": 2,
    "z_channels": 16,
    "scale_factor": 0.3611,
    "shift_factor": 0.1159
  },
  "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir/flux1-dev.sft",
  "ae_path": "/big/generator-ui/flux-testing/flux/model-dir/ae.sft",
  "repo_id": "black-forest-labs/FLUX.1-dev",
  "repo_flow": "flux1-dev.sft",
  "repo_ae": "ae.sft",
  "text_enc_max_length": 512,
  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
  "text_enc_device": "cuda:0",
  "ae_device": "cuda:0",
  "flux_device": "cuda:0",
  "flow_dtype": "float16",
  "ae_dtype": "bfloat16",
  "text_enc_dtype": "bfloat16",
  "text_enc_quantization_dtype": "qfloat8",
  "compile_extras": false,
  "compile_blocks": false,
  "offload_ae": false,
  "offload_text_enc": false,
  "offload_flow": false
}
configs/config-dev-eval.json
ADDED
@@ -0,0 +1,57 @@
{
  "version": "flux-dev",
  "params": {
    "in_channels": 64,
    "vec_in_dim": 768,
    "context_in_dim": 4096,
    "hidden_size": 3072,
    "mlp_ratio": 4.0,
    "num_heads": 24,
    "depth": 19,
    "depth_single_blocks": 38,
    "axes_dim": [16, 56, 56],
    "theta": 10000,
    "qkv_bias": true,
    "guidance_embed": true
  },
  "ae_params": {
    "resolution": 256,
    "in_channels": 3,
    "ch": 128,
    "out_ch": 3,
    "ch_mult": [1, 2, 4, 4],
    "num_res_blocks": 2,
    "z_channels": 16,
    "scale_factor": 0.3611,
    "shift_factor": 0.1159
  },
  "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir/flux1-dev.sft",
  "ae_path": "/big/generator-ui/flux-testing/flux/model-dir/ae.sft",
  "repo_id": "black-forest-labs/FLUX.1-dev",
  "repo_flow": "flux1-dev.sft",
  "repo_ae": "ae.sft",
  "text_enc_max_length": 512,
  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
  "text_enc_device": "cuda:1",
  "ae_device": "cuda:1",
  "flux_device": "cuda:0",
  "flow_dtype": "float16",
  "ae_dtype": "bfloat16",
  "text_enc_dtype": "bfloat16",
  "flow_quantization_dtype": "qfloat8",
  "text_enc_quantization_dtype": "qfloat8",
  "compile_extras": false,
  "compile_blocks": false,
  "offload_ae": false,
  "offload_text_enc": false,
  "offload_flow": false
}
configs/config-dev-gigaquant.json
ADDED
@@ -0,0 +1,58 @@
{
  "version": "flux-dev",
  "params": {
    "in_channels": 64,
    "vec_in_dim": 768,
    "context_in_dim": 4096,
    "hidden_size": 3072,
    "mlp_ratio": 4.0,
    "num_heads": 24,
    "depth": 19,
    "depth_single_blocks": 38,
    "axes_dim": [16, 56, 56],
    "theta": 10000,
    "qkv_bias": true,
    "guidance_embed": true
  },
  "ae_params": {
    "resolution": 256,
    "in_channels": 3,
    "ch": 128,
    "out_ch": 3,
    "ch_mult": [1, 2, 4, 4],
    "num_res_blocks": 2,
    "z_channels": 16,
    "scale_factor": 0.3611,
    "shift_factor": 0.1159
  },
  "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir/flux1-dev.sft",
  "ae_path": "/big/generator-ui/flux-testing/flux/model-dir/ae.sft",
  "repo_id": "black-forest-labs/FLUX.1-dev",
  "repo_flow": "flux1-dev.sft",
  "repo_ae": "ae.sft",
  "text_enc_max_length": 512,
  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
  "text_enc_device": "cuda:0",
  "ae_device": "cuda:0",
  "flux_device": "cuda:0",
  "flow_dtype": "float16",
  "ae_dtype": "bfloat16",
  "text_enc_dtype": "bfloat16",
  "num_to_quant": 220,
  "flow_quantization_dtype": "qint4",
  "text_enc_quantization_dtype": "qint4",
  "ae_quantization_dtype": "qint4",
  "clip_quantization_dtype": "qint4",
  "compile_extras": false,
  "compile_blocks": false,
  "quantize_extras": true
}
configs/config-dev-offload-1-4080.json
ADDED
@@ -0,0 +1,58 @@
{
  "version": "flux-dev",
  "params": {
    "in_channels": 64,
    "vec_in_dim": 768,
    "context_in_dim": 4096,
    "hidden_size": 3072,
    "mlp_ratio": 4.0,
    "num_heads": 24,
    "depth": 19,
    "depth_single_blocks": 38,
    "axes_dim": [16, 56, 56],
    "theta": 10000,
    "qkv_bias": true,
    "guidance_embed": true
  },
  "ae_params": {
    "resolution": 256,
    "in_channels": 3,
    "ch": 128,
    "out_ch": 3,
    "ch_mult": [1, 2, 4, 4],
    "num_res_blocks": 2,
    "z_channels": 16,
    "scale_factor": 0.3611,
    "shift_factor": 0.1159
  },
  "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir/flux1-dev.sft",
  "ae_path": "/big/generator-ui/flux-testing/flux/model-dir/ae.sft",
  "repo_id": "black-forest-labs/FLUX.1-dev",
  "repo_flow": "flux1-dev.sft",
  "repo_ae": "ae.sft",
  "text_enc_max_length": 512,
  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
  "text_enc_device": "cuda:0",
  "ae_device": "cuda:0",
  "flux_device": "cuda:0",
  "flow_dtype": "float16",
  "ae_dtype": "bfloat16",
  "text_enc_dtype": "bfloat16",
  "flow_quantization_dtype": "qfloat8",
  "text_enc_quantization_dtype": "qint4",
  "ae_quantization_dtype": "qfloat8",
  "compile_extras": true,
  "compile_blocks": true,
  "offload_text_encoder": true,
  "offload_vae": true,
  "offload_flow": true
}
configs/config-dev-offload-1-4090.json
ADDED
@@ -0,0 +1,58 @@
{
  "version": "flux-dev",
  "params": {
    "in_channels": 64,
    "vec_in_dim": 768,
    "context_in_dim": 4096,
    "hidden_size": 3072,
    "mlp_ratio": 4.0,
    "num_heads": 24,
    "depth": 19,
    "depth_single_blocks": 38,
    "axes_dim": [16, 56, 56],
    "theta": 10000,
    "qkv_bias": true,
    "guidance_embed": true
  },
  "ae_params": {
    "resolution": 256,
    "in_channels": 3,
    "ch": 128,
    "out_ch": 3,
    "ch_mult": [1, 2, 4, 4],
    "num_res_blocks": 2,
    "z_channels": 16,
    "scale_factor": 0.3611,
    "shift_factor": 0.1159
  },
  "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir/flux1-dev.sft",
  "ae_path": "/big/generator-ui/flux-testing/flux/model-dir/ae.sft",
  "repo_id": "black-forest-labs/FLUX.1-dev",
  "repo_flow": "flux1-dev.sft",
  "repo_ae": "ae.sft",
  "text_enc_max_length": 512,
  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
  "text_enc_device": "cuda:0",
  "ae_device": "cuda:0",
  "flux_device": "cuda:0",
  "flow_dtype": "float16",
  "ae_dtype": "bfloat16",
  "text_enc_dtype": "bfloat16",
  "flow_quantization_dtype": "qfloat8",
  "text_enc_quantization_dtype": "qint4",
  "ae_quantization_dtype": "qfloat8",
  "compile_extras": true,
  "compile_blocks": true,
  "offload_text_encoder": true,
  "offload_vae": true,
  "offload_flow": false
}
configs/config-dev-offload.json
ADDED
@@ -0,0 +1,58 @@
{
  "version": "flux-dev",
  "params": {
    "in_channels": 64,
    "vec_in_dim": 768,
    "context_in_dim": 4096,
    "hidden_size": 3072,
    "mlp_ratio": 4.0,
    "num_heads": 24,
    "depth": 19,
    "depth_single_blocks": 38,
    "axes_dim": [16, 56, 56],
    "theta": 10000,
    "qkv_bias": true,
    "guidance_embed": true
  },
  "ae_params": {
    "resolution": 256,
    "in_channels": 3,
    "ch": 128,
    "out_ch": 3,
    "ch_mult": [1, 2, 4, 4],
    "num_res_blocks": 2,
    "z_channels": 16,
    "scale_factor": 0.3611,
    "shift_factor": 0.1159
  },
  "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir/flux1-dev.sft",
  "ae_path": "/big/generator-ui/flux-testing/flux/model-dir/ae.sft",
  "repo_id": "black-forest-labs/FLUX.1-dev",
  "repo_flow": "flux1-dev.sft",
  "repo_ae": "ae.sft",
  "text_enc_max_length": 512,
  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
  "text_enc_device": "cuda:0",
  "ae_device": "cuda:0",
  "flux_device": "cuda:0",
  "flow_dtype": "float16",
  "ae_dtype": "bfloat16",
  "text_enc_dtype": "bfloat16",
  "flow_quantization_dtype": "qfloat8",
  "text_enc_quantization_dtype": "qint4",
  "ae_quantization_dtype": "qfloat8",
  "compile_extras": false,
  "compile_blocks": false,
  "offload_text_encoder": true,
  "offload_vae": true,
  "offload_flow": true
}
configs/config-dev-prequant.json
ADDED
@@ -0,0 +1,57 @@
{
  "version": "flux-dev",
  "params": {
    "in_channels": 64,
    "vec_in_dim": 768,
    "context_in_dim": 4096,
    "hidden_size": 3072,
    "mlp_ratio": 4.0,
    "num_heads": 24,
    "depth": 19,
    "depth_single_blocks": 38,
    "axes_dim": [16, 56, 56],
    "theta": 10000,
    "qkv_bias": true,
    "guidance_embed": true
  },
  "ae_params": {
    "resolution": 256,
    "in_channels": 3,
    "ch": 128,
    "out_ch": 3,
    "ch_mult": [1, 2, 4, 4],
    "num_res_blocks": 2,
    "z_channels": 16,
    "scale_factor": 0.3611,
    "shift_factor": 0.1159
  },
  "ckpt_path": "/big/generator-ui/flux-testing/flux/flux-fp16-acc/flux_fp8.safetensors",
  "ae_path": "/big/generator-ui/flux-testing/flux/model-dir/ae.sft",
  "repo_id": "black-forest-labs/FLUX.1-dev",
  "repo_flow": "flux1-dev.sft",
  "repo_ae": "ae.sft",
  "text_enc_max_length": 512,
  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
  "text_enc_device": "cuda:1",
  "ae_device": "cuda:1",
  "flux_device": "cuda:0",
  "flow_dtype": "float16",
  "ae_dtype": "bfloat16",
  "text_enc_dtype": "bfloat16",
  "text_enc_quantization_dtype": "qfloat8",
  "compile_extras": false,
  "compile_blocks": false,
  "prequantized_flow": true,
  "offload_ae": false,
  "offload_text_enc": false,
  "offload_flow": false
}
configs/config-dev.json
ADDED
@@ -0,0 +1,57 @@
{
  "version": "flux-dev",
  "params": {
    "in_channels": 64,
    "vec_in_dim": 768,
    "context_in_dim": 4096,
    "hidden_size": 3072,
    "mlp_ratio": 4.0,
    "num_heads": 24,
    "depth": 19,
    "depth_single_blocks": 38,
    "axes_dim": [16, 56, 56],
    "theta": 10000,
    "qkv_bias": true,
    "guidance_embed": true
  },
  "ae_params": {
    "resolution": 256,
    "in_channels": 3,
    "ch": 128,
    "out_ch": 3,
    "ch_mult": [1, 2, 4, 4],
    "num_res_blocks": 2,
    "z_channels": 16,
    "scale_factor": 0.3611,
    "shift_factor": 0.1159
  },
  "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir/flux1-dev.sft",
  "ae_path": "/big/generator-ui/flux-testing/flux/model-dir/ae.sft",
  "repo_id": "black-forest-labs/FLUX.1-dev",
  "repo_flow": "flux1-dev.sft",
  "repo_ae": "ae.sft",
  "text_enc_max_length": 512,
  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
  "text_enc_device": "cuda:1",
  "ae_device": "cuda:1",
  "flux_device": "cuda:0",
  "flow_dtype": "float16",
  "ae_dtype": "bfloat16",
  "text_enc_dtype": "bfloat16",
  "text_enc_quantization_dtype": "qfloat8",
  "ae_quantization_dtype": "qfloat8",
  "compile_extras": true,
  "compile_blocks": true,
  "offload_ae": false,
  "offload_text_enc": false,
  "offload_flow": false
}
configs/config-f8.json
ADDED
@@ -0,0 +1,48 @@
{
  "version": "flux-schnell",
  "params": {
    "in_channels": 64,
    "vec_in_dim": 768,
    "context_in_dim": 4096,
    "hidden_size": 3072,
    "mlp_ratio": 4.0,
    "num_heads": 24,
    "depth": 19,
    "depth_single_blocks": 38,
    "axes_dim": [16, 56, 56],
    "theta": 10000,
    "qkv_bias": true,
    "guidance_embed": false
  },
  "ae_params": {
    "resolution": 256,
    "in_channels": 3,
    "ch": 128,
    "out_ch": 3,
    "ch_mult": [1, 2, 4, 4],
    "num_res_blocks": 2,
    "z_channels": 16,
    "scale_factor": 0.3611,
    "shift_factor": 0.1159
  },
  "ckpt_path": "flux1-schnell.safetensors",
  "ae_path": "ae.safetensors",
  "repo_id": "black-forest-labs/FLUX.1-dev",
  "repo_flow": "flux1-dev.sft",
  "repo_ae": "ae.sft",
  "text_enc_max_length": 256,
  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
  "text_enc_device": "cuda:0",
  "ae_device": "cuda:0",
  "flux_device": "cuda:0",
  "flow_dtype": "float16",
  "ae_dtype": "bfloat16",
  "text_enc_dtype": "bfloat16",
  "flow_quantization_dtype": "qfloat8",
  "text_enc_quantization_dtype": "qfloat8",
  "compile_extras": true,
  "compile_blocks": true,
  "offload_text_encoder": false,
  "offload_vae": false,
  "offload_flow": false
}
configs/config-schnell-cuda0.json
ADDED
@@ -0,0 +1,57 @@
{
  "version": "flux-schnell",
  "params": {
    "in_channels": 64,
    "vec_in_dim": 768,
    "context_in_dim": 4096,
    "hidden_size": 3072,
    "mlp_ratio": 4.0,
    "num_heads": 24,
    "depth": 19,
    "depth_single_blocks": 38,
    "axes_dim": [16, 56, 56],
    "theta": 10000,
    "qkv_bias": true,
    "guidance_embed": false
  },
  "ae_params": {
    "resolution": 256,
    "in_channels": 3,
    "ch": 128,
    "out_ch": 3,
    "ch_mult": [1, 2, 4, 4],
    "num_res_blocks": 2,
    "z_channels": 16,
    "scale_factor": 0.3611,
    "shift_factor": 0.1159
  },
  "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir-schnell/flux1-schnell.sft",
  "ae_path": "/big/generator-ui/flux-testing/flux/model-dir-schnell/ae.sft",
  "repo_id": "black-forest-labs/FLUX.1-schnell",
  "repo_flow": "flux1-schnell.sft",
  "repo_ae": "ae.sft",
  "text_enc_max_length": 256,
  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
  "text_enc_device": "cuda:0",
  "ae_device": "cuda:0",
  "flux_device": "cuda:0",
  "flow_dtype": "float16",
  "ae_dtype": "bfloat16",
  "text_enc_dtype": "bfloat16",
  "text_enc_quantization_dtype": "qfloat8",
  "ae_quantization_dtype": "qfloat8",
  "compile_extras": false,
  "compile_blocks": false,
  "offload_ae": false,
  "offload_text_enc": false,
  "offload_flow": false
}
configs/config-schnell.json
ADDED
@@ -0,0 +1,57 @@
{
  "version": "flux-schnell",
  "params": {
    "in_channels": 64,
    "vec_in_dim": 768,
    "context_in_dim": 4096,
    "hidden_size": 3072,
    "mlp_ratio": 4.0,
    "num_heads": 24,
    "depth": 19,
    "depth_single_blocks": 38,
    "axes_dim": [16, 56, 56],
    "theta": 10000,
    "qkv_bias": true,
    "guidance_embed": false
  },
  "ae_params": {
    "resolution": 256,
    "in_channels": 3,
    "ch": 128,
    "out_ch": 3,
    "ch_mult": [1, 2, 4, 4],
    "num_res_blocks": 2,
    "z_channels": 16,
    "scale_factor": 0.3611,
    "shift_factor": 0.1159
  },
  "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir-schnell/flux1-schnell.sft",
  "ae_path": "/big/generator-ui/flux-testing/flux/model-dir-schnell/ae.sft",
  "repo_id": "black-forest-labs/FLUX.1-schnell",
  "repo_flow": "flux1-schnell.sft",
  "repo_ae": "ae.sft",
  "text_enc_max_length": 256,
  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
  "text_enc_device": "cuda:1",
  "ae_device": "cuda:1",
  "flux_device": "cuda:0",
  "flow_dtype": "float16",
  "ae_dtype": "bfloat16",
  "text_enc_dtype": "bfloat16",
  "text_enc_quantization_dtype": "qfloat8",
  "ae_quantization_dtype": "qfloat8",
  "compile_extras": true,
  "compile_blocks": true,
  "offload_ae": false,
  "offload_text_enc": false,
  "offload_flow": false
}
dark.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c633fc7d1af2452f0680abdc20baa285c43e107ae9a32fbf995c55c13bf0c4dd
size 39759552
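The three lines above are a Git LFS pointer, so the actual weights live in LFS storage. Once fetched, the file can be inspected with the same `safetensors` API that app.py uses:

from safetensors.torch import load_file

# requires `git lfs pull` first, so the pointer is replaced by real data
state_dict = load_file("dark.safetensors", device="cpu")
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape), tensor.dtype)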
f8.json
ADDED
@@ -0,0 +1,48 @@
{
  "version": "flux-schnell",
  "params": {
    "in_channels": 64,
    "vec_in_dim": 768,
    "context_in_dim": 4096,
    "hidden_size": 3072,
    "mlp_ratio": 4.0,
    "num_heads": 24,
    "depth": 19,
    "depth_single_blocks": 38,
    "axes_dim": [16, 56, 56],
    "theta": 10000,
    "qkv_bias": true,
    "guidance_embed": false
  },
  "ae_params": {
    "resolution": 256,
    "in_channels": 3,
    "ch": 128,
    "out_ch": 3,
    "ch_mult": [1, 2, 4, 4],
    "num_res_blocks": 2,
    "z_channels": 16,
    "scale_factor": 0.3611,
    "shift_factor": 0.1159
  },
  "ckpt_path": "flux1-schnell.safetensors",
  "ae_path": "ae.safetensors",
  "repo_id": "black-forest-labs/FLUX.1-dev",
  "repo_flow": "flux1-dev.sft",
  "repo_ae": "ae.sft",
  "text_enc_max_length": 512,
  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
  "text_enc_device": "cuda:0",
  "ae_device": "cuda:0",
  "flux_device": "cuda:0",
  "flow_dtype": "float16",
  "ae_dtype": "bfloat16",
  "text_enc_dtype": "bfloat16",
  "flow_quantization_dtype": "qfloat8",
  "text_enc_quantization_dtype": "qfloat8",
  "compile_extras": true,
  "compile_blocks": true,
  "offload_text_encoder": false,
  "offload_vae": false,
  "offload_flow": false
}
float8_quantize.py
ADDED
@@ -0,0 +1,496 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from loguru import logger
import torch
import torch.nn as nn
from torch.nn import init
import math
from torch.compiler import is_compiling
from torch import __version__
from torch.version import cuda

from modules.flux_model import Modulation

# torch.__version__ supports tuple comparison; on torch 2.4.x torch._scaled_mm
# returns an (output, amax) tuple, while newer releases return the output alone
# (see forward() below).
IS_TORCH_2_4 = __version__ < (2, 4, 9)
LT_TORCH_2_4 = __version__ < (2, 4)
if LT_TORCH_2_4:
    if not hasattr(torch, "_scaled_mm"):
        raise RuntimeError(
            "This version of PyTorch is not supported. Please upgrade to PyTorch 2.4 with CUDA 12.4 or later."
        )
CUDA_VERSION = float(cuda) if cuda else 0
if CUDA_VERSION < 12.4:
    raise RuntimeError(
        "This version of PyTorch is not supported. Please upgrade to PyTorch 2.4 with CUDA 12.4 or later; "
        f"got torch version {__version__} and CUDA version {cuda}."
    )
try:
    from cublas_ops import CublasLinear
except ImportError:
    CublasLinear = type(None)


class F8Linear(nn.Module):

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        device=None,
        dtype=torch.float16,
        float8_dtype=torch.float8_e4m3fn,
        float_weight: torch.Tensor = None,
        float_bias: torch.Tensor = None,
        num_scale_trials: int = 12,
        input_float8_dtype=torch.float8_e5m2,
    ) -> None:
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.float8_dtype = float8_dtype
        self.input_float8_dtype = input_float8_dtype
        self.input_scale_initialized = False
        self.weight_initialized = False
        self.max_value = torch.finfo(self.float8_dtype).max
        self.input_max_value = torch.finfo(self.input_float8_dtype).max
        factory_kwargs = {"dtype": dtype, "device": device}
        if float_weight is None:
            self.weight = nn.Parameter(
                torch.empty((out_features, in_features), **factory_kwargs)
            )
        else:
            self.weight = nn.Parameter(
                float_weight, requires_grad=float_weight.requires_grad
            )
        if float_bias is None:
            if bias:
                self.bias = nn.Parameter(
                    torch.empty(out_features, **factory_kwargs),
                )
            else:
                self.register_parameter("bias", None)
        else:
            self.bias = nn.Parameter(float_bias, requires_grad=float_bias.requires_grad)
        self.num_scale_trials = num_scale_trials
        self.input_amax_trials = torch.zeros(
            num_scale_trials, requires_grad=False, device=device, dtype=torch.float32
        )
        self.trial_index = 0
        # Buffers are filled in lazily: the weight scale when the weight is
        # quantized, the input scale after num_scale_trials calibration passes.
        self.register_buffer("scale", None)
        self.register_buffer("input_scale", None)
        self.register_buffer("float8_data", None)
        self.register_buffer("scale_reciprocal", None)
        self.register_buffer("input_scale_reciprocal", None)

    def _load_from_state_dict(
        self,
        state_dict,
        prefix,
        local_metadata,
        strict,
        missing_keys,
        unexpected_keys,
        error_msgs,
    ):
        sd = {k.replace(prefix, ""): v for k, v in state_dict.items()}
        if "weight" in sd:
            if (
                "float8_data" not in sd
                or sd["float8_data"] is None
                and sd["weight"].shape == (self.out_features, self.in_features)
            ):
                # Initialize as if it's an F8Linear that needs to be quantized
                self._parameters["weight"] = nn.Parameter(
                    sd["weight"], requires_grad=False
                )
                if "bias" in sd:
                    self._parameters["bias"] = nn.Parameter(
                        sd["bias"], requires_grad=False
                    )
                self.quantize_weight()
            elif sd["float8_data"].shape == (
                self.out_features,
                self.in_features,
            ) and torch.equal(sd["weight"], torch.zeros_like(sd["weight"])):
                w = sd["weight"]
                # Set the init values as if it's already quantized float8_data
                self._buffers["float8_data"] = sd["float8_data"]
                self._parameters["weight"] = nn.Parameter(
                    torch.zeros(
                        1,
                        dtype=w.dtype,
                        device=w.device,
                        requires_grad=False,
                    )
                )
                if "bias" in sd:
                    self._parameters["bias"] = nn.Parameter(
                        sd["bias"], requires_grad=False
                    )
                self.weight_initialized = True

                # Check if scales and reciprocals are initialized
                if all(
                    key in sd
                    for key in [
                        "scale",
                        "input_scale",
                        "scale_reciprocal",
                        "input_scale_reciprocal",
                    ]
                ):
                    self.scale = sd["scale"].float()
                    self.input_scale = sd["input_scale"].float()
                    self.scale_reciprocal = sd["scale_reciprocal"].float()
                    self.input_scale_reciprocal = sd["input_scale_reciprocal"].float()
                    self.input_scale_initialized = True
                    self.trial_index = self.num_scale_trials
                elif "scale" in sd and "scale_reciprocal" in sd:
                    self.scale = sd["scale"].float()
                    self.input_scale = (
                        sd["input_scale"].float() if "input_scale" in sd else None
                    )
                    self.scale_reciprocal = sd["scale_reciprocal"].float()
                    self.input_scale_reciprocal = (
                        sd["input_scale_reciprocal"].float()
                        if "input_scale_reciprocal" in sd
                        else None
                    )
                    self.input_scale_initialized = (
                        True if "input_scale" in sd else False
                    )
                    self.trial_index = (
                        self.num_scale_trials if "input_scale" in sd else 0
                    )
                    self.input_amax_trials = torch.zeros(
                        self.num_scale_trials,
                        requires_grad=False,
                        dtype=torch.float32,
                        device=self.weight.device,
                    )
                    self.input_scale_initialized = False
                    self.trial_index = 0
                else:
                    # If scales are not initialized, reset trials
                    self.input_scale_initialized = False
                    self.trial_index = 0
                    self.input_amax_trials = torch.zeros(
                        self.num_scale_trials, requires_grad=False, dtype=torch.float32
                    )
            else:
                raise RuntimeError(
                    f"Weight tensor not found or has incorrect shape in state dict: {sd.keys()}"
                )
        else:
            raise RuntimeError(
                "Weight tensor not found or has incorrect shape in state dict"
            )

    def quantize_weight(self):
        if self.weight_initialized:
            return
        amax = torch.max(torch.abs(self.weight.data)).float()
        self.scale = self.amax_to_scale(amax, self.max_value)
        self.float8_data = self.to_fp8_saturated(
            self.weight.data, self.scale, self.max_value
        ).to(self.float8_dtype)
        self.scale_reciprocal = self.scale.reciprocal()
        # Free the full-precision weight; only the float8 copy is kept.
        self.weight.data = torch.zeros(
            1, dtype=self.weight.dtype, device=self.weight.device, requires_grad=False
        )
        self.weight_initialized = True

    def set_weight_tensor(self, tensor: torch.Tensor):
        self.weight.data = tensor
        self.weight_initialized = False
        self.quantize_weight()

    def amax_to_scale(self, amax, max_val):
        # Scale so the largest observed magnitude maps to the float8 max.
        return (max_val / torch.clamp(amax, min=1e-12)).clamp(max=max_val)

    def to_fp8_saturated(self, x, scale, max_val):
        return (x * scale).clamp(-max_val, max_val)

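    # Worked example (illustrative comment, not in the original file): with
    # amax = 2.0 and torch.finfo(torch.float8_e4m3fn).max == 448.0, the weight
    # scale is 448 / 2 = 224; weights are multiplied by 224 before the fp8 cast,
    # and _scaled_mm rescales the output using the stored reciprocals.
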
    def quantize_input(self, x: torch.Tensor):
        if self.input_scale_initialized:
            return self.to_fp8_saturated(x, self.input_scale, self.input_max_value).to(
                self.input_float8_dtype
            )
        elif self.trial_index < self.num_scale_trials:
            # Calibration phase: track the running max over the first
            # num_scale_trials inputs and derive the input scale from it.
            amax = torch.max(torch.abs(x)).float()

            self.input_amax_trials[self.trial_index] = amax
            self.trial_index += 1
            self.input_scale = self.amax_to_scale(
                self.input_amax_trials[: self.trial_index].max(), self.input_max_value
            )
            self.input_scale_reciprocal = self.input_scale.reciprocal()
            return self.to_fp8_saturated(x, self.input_scale, self.input_max_value).to(
                self.input_float8_dtype
            )
        else:
            self.input_scale = self.amax_to_scale(
                self.input_amax_trials.max(), self.input_max_value
            )
            self.input_scale_reciprocal = self.input_scale.reciprocal()
            self.input_scale_initialized = True
            return self.to_fp8_saturated(x, self.input_scale, self.input_max_value).to(
                self.input_float8_dtype
            )

    def reset_parameters(self) -> None:
        if self.weight_initialized:
            self.weight = nn.Parameter(
                torch.empty(
                    (self.out_features, self.in_features),
                    dtype=self.weight.dtype,
                    device=self.weight.device,
                )
            )
            self.weight_initialized = False
            self.input_scale_initialized = False
            self.trial_index = 0
            self.input_amax_trials.zero_()
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
            init.uniform_(self.bias, -bound, bound)
        self.quantize_weight()
        self.max_value = torch.finfo(self.float8_dtype).max
        self.input_max_value = torch.finfo(self.input_float8_dtype).max

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.input_scale_initialized or is_compiling():
            x = self.to_fp8_saturated(x, self.input_scale, self.input_max_value).to(
                self.input_float8_dtype
            )
        else:
            x = self.quantize_input(x)

        prev_dims = x.shape[:-1]
        x = x.view(-1, self.in_features)

        # float8 matmul, much faster than float16 matmul w/ float32 accumulate on ADA devices!
        out = torch._scaled_mm(
            x,
            self.float8_data.T,
            scale_a=self.input_scale_reciprocal,
            scale_b=self.scale_reciprocal,
            bias=self.bias,
            out_dtype=self.weight.dtype,
            use_fast_accum=True,
        )
        if IS_TORCH_2_4:
            # torch 2.4's _scaled_mm returns an (output, amax) tuple
            out = out[0]
        out = out.view(*prev_dims, self.out_features)
        return out

    @classmethod
    def from_linear(
        cls,
        linear: nn.Linear,
        float8_dtype=torch.float8_e4m3fn,
        input_float8_dtype=torch.float8_e5m2,
    ) -> "F8Linear":
        f8_lin = cls(
            in_features=linear.in_features,
            out_features=linear.out_features,
            bias=linear.bias is not None,
            device=linear.weight.device,
            dtype=linear.weight.dtype,
            float8_dtype=float8_dtype,
            float_weight=linear.weight.data,
            float_bias=(linear.bias.data if linear.bias is not None else None),
            input_float8_dtype=input_float8_dtype,
        )
        f8_lin.quantize_weight()
        return f8_lin


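# Illustrative usage (comment only, not part of the original module): converting
# a plain linear layer and running a forward pass on an ADA-class GPU.
#
#   lin = nn.Linear(4096, 4096, dtype=torch.float16, device="cuda")
#   f8 = F8Linear.from_linear(lin)  # weights quantized to float8_e4m3fn
#   y = f8(torch.randn(1, 77, 4096, dtype=torch.float16, device="cuda"))

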
@torch.inference_mode()
def recursive_swap_linears(
    model: nn.Module,
    float8_dtype=torch.float8_e4m3fn,
    input_float8_dtype=torch.float8_e5m2,
    quantize_modulation: bool = True,
    ignore_keys: list[str] = [],
) -> None:
    """
    Recursively swaps all nn.Linear modules in the given model with F8Linear modules.

    This function traverses the model's structure and replaces each nn.Linear
    instance with an F8Linear instance, which uses 8-bit floating point
    quantization for weights. The original linear layer's weights are deleted
    after conversion to save memory.

    Args:
        model (nn.Module): The PyTorch model to modify.

    Note:
        This function modifies the model in-place. After calling this function,
        all linear layers in the model will be using 8-bit quantization.
    """
    for name, child in model.named_children():
        if name in ignore_keys:
            continue
        if isinstance(child, Modulation) and not quantize_modulation:
            continue
        if isinstance(child, nn.Linear) and not isinstance(
            child, (F8Linear, CublasLinear)
        ):
            setattr(
                model,
                name,
                F8Linear.from_linear(
                    child,
                    float8_dtype=float8_dtype,
                    input_float8_dtype=input_float8_dtype,
                ),
            )
            del child
        else:
            recursive_swap_linears(
                child,
                float8_dtype=float8_dtype,
                input_float8_dtype=input_float8_dtype,
                quantize_modulation=quantize_modulation,
                ignore_keys=ignore_keys,
            )


@torch.inference_mode()
def swap_to_cublaslinear(model: nn.Module):
    if CublasLinear == type(None):
        return
    for name, child in model.named_children():
        if isinstance(child, nn.Linear) and not isinstance(
            child, (F8Linear, CublasLinear)
        ):
            cublas_lin = CublasLinear(
                child.in_features,
                child.out_features,
                bias=child.bias is not None,
                dtype=child.weight.dtype,
                device=child.weight.device,
            )
            cublas_lin.weight.data = child.weight.clone().detach()
            # guard against bias-free linears before copying the bias
            if child.bias is not None:
                cublas_lin.bias.data = child.bias.clone().detach()
            setattr(model, name, cublas_lin)
            del child
        else:
            swap_to_cublaslinear(child)


@torch.inference_mode()
def quantize_flow_transformer_and_dispatch_float8(
    flow_model: nn.Module,
    device=torch.device("cuda"),
    float8_dtype=torch.float8_e4m3fn,
    input_float8_dtype=torch.float8_e5m2,
    offload_flow=False,
    swap_linears_with_cublaslinear=True,
    flow_dtype=torch.float16,
    quantize_modulation: bool = True,
    quantize_flow_embedder_layers: bool = True,
) -> nn.Module:
    """
    Quantize the flux flow transformer model (original BFL codebase version) and dispatch to the given device.

    Iteratively pushes each module to device, evals, replaces linear layers with F8Linear except for final_layer, and quantizes.

    Allows for fast dispatch to gpu & quantize without causing OOM on gpus with limited memory.

    After dispatching, if offload_flow is True, offloads the model to cpu.

    If swap_linears_with_cublaslinear is True and flow_dtype == torch.float16, swaps all linears
    with cublas linears for a 2x performance boost on consumer GPUs; otherwise the cublas linear
    swap is skipped.

    For extra precision, you can set quantize_flow_embedder_layers to False;
    this helps maintain the output quality of the flow transformer more than fully quantizing,
    at the expense of ~512MB more VRAM usage.

    For extra precision, you can set quantize_modulation to False;
    this helps maintain the output quality of the flow transformer more than fully quantizing,
    at the expense of ~2GB more VRAM usage, and has a much higher impact on image quality than
    the embedder layers.
    """
    for module in flow_model.double_blocks:
        module.to(device)
        module.eval()
        recursive_swap_linears(
            module,
            float8_dtype=float8_dtype,
            input_float8_dtype=input_float8_dtype,
            quantize_modulation=quantize_modulation,
        )
        torch.cuda.empty_cache()
    for module in flow_model.single_blocks:
        module.to(device)
        module.eval()
        recursive_swap_linears(
            module,
            float8_dtype=float8_dtype,
            input_float8_dtype=input_float8_dtype,
            quantize_modulation=quantize_modulation,
        )
        torch.cuda.empty_cache()
    to_gpu_extras = [
        "vector_in",
        "img_in",
        "txt_in",
        "time_in",
        "guidance_in",
        "final_layer",
        "pe_embedder",
    ]
    for module in to_gpu_extras:
        m_extra = getattr(flow_model, module)
        if m_extra is None:
            continue
        m_extra.to(device)
        m_extra.eval()
        if isinstance(m_extra, nn.Linear) and not isinstance(
            m_extra, (F8Linear, CublasLinear)
        ):
            if quantize_flow_embedder_layers:
                setattr(
                    flow_model,
                    module,
                    F8Linear.from_linear(
                        m_extra,
                        float8_dtype=float8_dtype,
                        input_float8_dtype=input_float8_dtype,
                    ),
                )
                del m_extra
        elif module != "final_layer":
            if quantize_flow_embedder_layers:
                recursive_swap_linears(
                    m_extra,
                    float8_dtype=float8_dtype,
                    input_float8_dtype=input_float8_dtype,
                    quantize_modulation=quantize_modulation,
                )
        torch.cuda.empty_cache()
    if (
        swap_linears_with_cublaslinear
        and flow_dtype == torch.float16
        and CublasLinear != type(None)
    ):
        swap_to_cublaslinear(flow_model)
    elif swap_linears_with_cublaslinear and flow_dtype != torch.float16:
        logger.warning("Skipping cublas linear swap because flow_dtype is not float16")
    if offload_flow:
        flow_model.to("cpu")
        torch.cuda.empty_cache()
    return flow_model
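
float8_quantize.py is the quantization entry point the pipeline uses when a config requests qfloat8 weights. A minimal sketch of calling it directly, assuming `flow_model` is a Flux transformer already constructed elsewhere in the repo (the argument values are illustrative, not prescribed defaults):

import torch
from float8_quantize import quantize_flow_transformer_and_dispatch_float8

# Sketch only: `flow_model` is assumed to be an already-built
# modules.flux_model.Flux instance loaded in float16.
flow_model = quantize_flow_transformer_and_dispatch_float8(
    flow_model,
    device=torch.device("cuda:0"),
    offload_flow=False,
    quantize_modulation=True,
    quantize_flow_embedder_layers=False,  # trades ~512MB VRAM for extra precision
)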
flux_emphasis.py
ADDED
@@ -0,0 +1,447 @@
from typing import TYPE_CHECKING, Optional
from pydash import flatten

import torch
from transformers.models.clip.tokenization_clip import CLIPTokenizer
from einops import repeat

if TYPE_CHECKING:
    from flux_pipeline import FluxPipeline


def parse_prompt_attention(text):
    """
    Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
    Accepted tokens are:
    (abc) - increases attention to abc by a multiplier of 1.1
    (abc:3.12) - increases attention to abc by a multiplier of 3.12
    [abc] - decreases attention to abc by a multiplier of 1.1
    \\( - literal character '('
    \\[ - literal character '['
    \\) - literal character ')'
    \\] - literal character ']'
    \\ - literal character '\'
    anything else - just text

    >>> parse_prompt_attention('normal text')
    [['normal text', 1.0]]
    >>> parse_prompt_attention('an (important) word')
    [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
    >>> parse_prompt_attention('(unbalanced')
    [['unbalanced', 1.1]]
    >>> parse_prompt_attention('\\(literal\\]')
    [['(literal]', 1.0]]
    >>> parse_prompt_attention('(unnecessary)(parens)')
    [['unnecessaryparens', 1.1]]
    >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
    [['a ', 1.0],
     ['house', 1.5730000000000004],
     [' ', 1.1],
     ['on', 1.0],
     [' a ', 1.1],
     ['hill', 0.55],
     [', sun, ', 1.1],
     ['sky', 1.4641000000000006],
     ['.', 1.1]]
    """
    import re

    re_attention = re.compile(
        r"""
        \\\(|\\\)|\\\[|\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|
        \)|]|[^\\()\[\]:]+|:
        """,
        re.X,
    )

    re_break = re.compile(r"\s*\bBREAK\b\s*", re.S)

    res = []
    round_brackets = []
    square_brackets = []

    round_bracket_multiplier = 1.1
    square_bracket_multiplier = 1 / 1.1

    def multiply_range(start_position, multiplier):
        for p in range(start_position, len(res)):
            res[p][1] *= multiplier

    for m in re_attention.finditer(text):
        text = m.group(0)
        weight = m.group(1)

        if text.startswith("\\"):
            res.append([text[1:], 1.0])
        elif text == "(":
            round_brackets.append(len(res))
        elif text == "[":
            square_brackets.append(len(res))
        elif weight is not None and len(round_brackets) > 0:
            multiply_range(round_brackets.pop(), float(weight))
        elif text == ")" and len(round_brackets) > 0:
            multiply_range(round_brackets.pop(), round_bracket_multiplier)
        elif text == "]" and len(square_brackets) > 0:
            multiply_range(square_brackets.pop(), square_bracket_multiplier)
        else:
            parts = re.split(re_break, text)
            for i, part in enumerate(parts):
                if i > 0:
                    res.append(["BREAK", -1])
                res.append([part, 1.0])

    for pos in round_brackets:
        multiply_range(pos, round_bracket_multiplier)

    for pos in square_brackets:
        multiply_range(pos, square_bracket_multiplier)

    if len(res) == 0:
        res = [["", 1.0]]

    # merge runs of identical weights
    i = 0
    while i + 1 < len(res):
        if res[i][1] == res[i + 1][1]:
            res[i][0] += res[i + 1][0]
            res.pop(i + 1)
        else:
            i += 1

    return res


def get_prompts_tokens_with_weights(
    clip_tokenizer: CLIPTokenizer, prompt: str, debug: bool = False
):
    """
    Get prompt token ids and weights. This function works for both prompt and negative prompt.

    Args:
        clip_tokenizer (CLIPTokenizer)
            A CLIPTokenizer
        prompt (str)
            A prompt string with weights

    Returns:
        text_tokens (list)
            A list containing token ids
        text_weight (list)
            A list containing the corresponding weight of each token id

    Example:
        import torch
        from transformers import CLIPTokenizer

        clip_tokenizer = CLIPTokenizer.from_pretrained(
            "stablediffusionapi/deliberate-v2"
            , subfolder = "tokenizer"
            , dtype = torch.float16
        )

        token_id_list, token_weight_list = get_prompts_tokens_with_weights(
            clip_tokenizer = clip_tokenizer
            ,prompt = "a (red:1.5) cat"*70
        )
    """
    texts_and_weights = parse_prompt_attention(prompt)
    text_tokens, text_weights = [], []
    maxlen = clip_tokenizer.model_max_length
    for word, weight in texts_and_weights:
        # tokenize without special tokens so prompts of any length can be handled;
        # the starting and ending tokens are added later, per chunk
        token = clip_tokenizer(
            word, truncation=False, padding=False, add_special_tokens=False
        ).input_ids
        # the returned token is a 1d list: [320, 1125, 539, 320]
        if debug:
            print(
                token,
                "|FOR MODEL LEN{}|".format(maxlen),
                clip_tokenizer.decode(
                    token, skip_special_tokens=True, clean_up_tokenization_spaces=True
                ),
            )
        # merge the new tokens into the running token list: text_tokens
        text_tokens = [*text_tokens, *token]

        # each token chunk comes with one weight, like ['red cat', 2.0];
        # expand the weight so each token gets its own copy.
        chunk_weights = [weight] * len(token)

        # append the weights to the running weight list: text_weights
        text_weights = [*text_weights, *chunk_weights]
    return text_tokens, text_weights


def group_tokens_and_weights(
    token_ids: list,
    weights: list,
    pad_last_block=False,
    bos=49406,
    eos=49407,
    max_length=77,
    pad_tokens=True,
):
    """
    Produce tokens and weights in groups and pad the missing tokens.

    Args:
        token_ids (list)
            The token ids from the tokenizer
        weights (list)
            The weights list from the function get_prompts_tokens_with_weights
        pad_last_block (bool)
            Controls whether to fill the last token group to 75 tokens with eos
    Returns:
        new_token_ids (2d list)
        new_weights (2d list)

    Example:
        token_groups, weight_groups = group_tokens_and_weights(
            token_ids = token_id_list
            , weights = token_weight_list
        )
    """
    # TODO: Possibly need to fix this, since this doesn't seem correct.
    # Ignoring for now since I don't know what the consequences might be
    # if changed to <= instead of <.
    max_len = max_length - 2 if max_length < 77 else max_length
    # this will be a 2d list
    new_token_ids = []
    new_weights = []
    while len(token_ids) >= max_len:
        # pull the next chunk of max_len tokens off the front
        temp_77_token_ids = [token_ids.pop(0) for _ in range(max_len)]
        temp_77_weights = [weights.pop(0) for _ in range(max_len)]

        if pad_tokens:
            if bos is not None:
                temp_77_token_ids = [bos] + temp_77_token_ids + [eos]
                temp_77_weights = [1.0] + temp_77_weights + [1.0]
            else:
                temp_77_token_ids = temp_77_token_ids + [eos]
                temp_77_weights = temp_77_weights + [1.0]

        # add the token and weight chunk to the holder lists
        new_token_ids.append(temp_77_token_ids)
        new_weights.append(temp_77_weights)

    # pad whatever tokens are left over
    if len(token_ids) > 0:
        if pad_tokens:
            padding_len = max_len - len(token_ids) if pad_last_block else 0

            temp_77_token_ids = [bos] + token_ids + [eos] * padding_len + [eos]
            new_token_ids.append(temp_77_token_ids)

            temp_77_weights = [1.0] + weights + [1.0] * padding_len + [1.0]
            new_weights.append(temp_77_weights)
        else:
            new_token_ids.append(token_ids)
            new_weights.append(weights)
    return new_token_ids, new_weights


def standardize_tensor(
    input_tensor: torch.Tensor, target_mean: float, target_std: float
) -> torch.Tensor:
    """
    This function standardizes an input tensor so that it has a specific mean and standard deviation.

    Parameters:
        input_tensor (torch.Tensor): The tensor to standardize.
        target_mean (float): The target mean for the tensor.
        target_std (float): The target standard deviation for the tensor.

    Returns:
        torch.Tensor: The standardized tensor.
    """

    # First, compute the mean and std of the input tensor
    mean = input_tensor.mean()
    std = input_tensor.std()

    # Then, standardize the tensor to have a mean of 0 and std of 1
    standardized_tensor = (input_tensor - mean) / std

    # Finally, scale the tensor to the target mean and std
    output_tensor = standardized_tensor * target_std + target_mean

    return output_tensor


def apply_weights(
    prompt_tokens: torch.Tensor,
    weight_tensor: torch.Tensor,
    token_embedding: torch.Tensor,
    eos_token_id: int,
    pad_last_block: bool = True,
) -> torch.FloatTensor:
    mean = token_embedding.mean()
    std = token_embedding.std()
    if pad_last_block:
        # pool at the position of the first eos token in each sequence
        pooled_tensor = token_embedding[
            torch.arange(token_embedding.shape[0], device=token_embedding.device),
            (
                prompt_tokens.to(dtype=torch.int, device=token_embedding.device)
                == eos_token_id
            )
            .int()
            .argmax(dim=-1),
        ]
    else:
        pooled_tensor = token_embedding[:, -1]

    # Each weighted token embedding is moved away from (w > 1) or toward (w < 1)
    # the pooled embedding, then the whole tensor is re-standardized to the
    # original mean/std so overall magnitudes stay stable.
    for j in range(len(weight_tensor)):
        if weight_tensor[j] != 1.0:
            token_embedding[:, j] = (
                pooled_tensor
                + (token_embedding[:, j] - pooled_tensor) * weight_tensor[j]
            )
    return standardize_tensor(token_embedding, mean, std)


@torch.inference_mode()
def get_weighted_text_embeddings_flux(
    pipe: "FluxPipeline",
    prompt: str = "",
    num_images_per_prompt: int = 1,
    device: Optional[torch.device] = None,
    target_device: Optional[torch.device] = torch.device("cuda:0"),
    target_dtype: Optional[torch.dtype] = torch.bfloat16,
    debug: bool = False,
):
    """
    This function processes long, weighted prompts for Flux with no length limitation.

    Args:
        pipe (FluxPipeline)
        prompt (str)
        num_images_per_prompt (int)
        device (torch.device)
    Returns:
        clip_embeds (torch.Tensor)
        t5_embeds (torch.Tensor)
        txt_ids (torch.Tensor)
    """
    device = device or pipe._execution_device

    eos = pipe.clip.tokenizer.eos_token_id
    eos_2 = pipe.t5.tokenizer.eos_token_id
    bos = pipe.clip.tokenizer.bos_token_id
    bos_2 = pipe.t5.tokenizer.bos_token_id

    clip = pipe.clip.hf_module
    t5 = pipe.t5.hf_module

    tokenizer_clip = pipe.clip.tokenizer
    tokenizer_t5 = pipe.t5.tokenizer

    t5_length = 512 if pipe.name == "flux-dev" else 256
    clip_length = 77

    # tokenizer 1
    prompt_tokens_clip, prompt_weights_clip = get_prompts_tokens_with_weights(
        tokenizer_clip, prompt, debug=debug
    )

    # tokenizer 2
    prompt_tokens_t5, prompt_weights_t5 = get_prompts_tokens_with_weights(
        tokenizer_t5, prompt, debug=debug
    )

    prompt_tokens_clip_grouped, prompt_weights_clip_grouped = group_tokens_and_weights(
        prompt_tokens_clip,
        prompt_weights_clip,
        pad_last_block=True,
        bos=bos,
        eos=eos,
        max_length=clip_length,
    )
    prompt_tokens_t5_grouped, prompt_weights_t5_grouped = group_tokens_and_weights(
        prompt_tokens_t5,
        prompt_weights_t5,
        pad_last_block=True,
        bos=bos_2,
        eos=eos_2,
        max_length=t5_length,
        pad_tokens=False,
    )
    prompt_tokens_t5 = flatten(prompt_tokens_t5_grouped)
    prompt_weights_t5 = flatten(prompt_weights_t5_grouped)
    prompt_tokens_clip = flatten(prompt_tokens_clip_grouped)
    prompt_weights_clip = flatten(prompt_weights_clip_grouped)

    prompt_tokens_clip = tokenizer_clip.decode(
        prompt_tokens_clip, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    prompt_tokens_clip = tokenizer_clip(
        prompt_tokens_clip,
        add_special_tokens=True,
        padding="max_length",
        truncation=True,
        max_length=clip_length,
        return_tensors="pt",
    ).input_ids.to(device)
    prompt_tokens_t5 = tokenizer_t5.decode(
        prompt_tokens_t5, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    prompt_tokens_t5 = tokenizer_t5(
        prompt_tokens_t5,
        add_special_tokens=True,
        padding="max_length",
        truncation=True,
        max_length=t5_length,
        return_tensors="pt",
    ).input_ids.to(device)

    prompt_weights_t5 = torch.cat(
        [
            torch.tensor(prompt_weights_t5, dtype=torch.float32),
            torch.full(
                (t5_length - torch.tensor(prompt_weights_t5).numel(),),
                1.0,
                dtype=torch.float32,
            ),
        ],
        dim=0,
    ).to(device)

    clip_embeds = clip(
        prompt_tokens_clip, output_hidden_states=True, attention_mask=None
    )["pooler_output"]
    if clip_embeds.shape[0] == 1 and num_images_per_prompt > 1:
        clip_embeds = repeat(clip_embeds, "1 ... -> bs ...", bs=num_images_per_prompt)

    weight_tensor_t5 = torch.tensor(
        flatten(prompt_weights_t5), dtype=torch.float32, device=device
    )
    t5_embeds = t5(prompt_tokens_t5, output_hidden_states=True, attention_mask=None)[
        "last_hidden_state"
    ]
    t5_embeds = apply_weights(prompt_tokens_t5, weight_tensor_t5, t5_embeds, eos_2)
    if debug:
        print(t5_embeds.shape)
    if t5_embeds.shape[0] == 1 and num_images_per_prompt > 1:
        t5_embeds = repeat(t5_embeds, "1 ... -> bs ...", bs=num_images_per_prompt)
    txt_ids = torch.zeros(
        num_images_per_prompt,
        t5_embeds.shape[1],
        3,
        device=target_device,
        dtype=target_dtype,
    )
    t5_embeds = t5_embeds.to(target_device, dtype=target_dtype)
    clip_embeds = clip_embeds.to(target_device, dtype=target_dtype)

    return (
        clip_embeds,
        t5_embeds,
        txt_ids,
    )
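
flux_emphasis.py ends here; the key idea is that token weights scale each embedding's offset from the pooled (eos) embedding before re-standardization. A self-contained toy sketch of that core step, with made-up shapes (illustrative only; real inputs come from the T5 encoder above):

import torch

emb = torch.randn(1, 4, 8)      # stand-in for (batch, tokens, dim) hidden states
pooled = emb[:, -1]             # stand-in for the pooled vector
weights = [1.0, 1.5, 0.5, 1.0]  # e.g. from parse_prompt_attention
for j, w in enumerate(weights):
    if w != 1.0:
        # push token j away from (w > 1) or toward (w < 1) the pooled vector
        emb[:, j] = pooled + (emb[:, j] - pooled) * w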
flux_pipeline.py
ADDED
@@ -0,0 +1,729 @@
import io
import math
import random
import warnings
from typing import TYPE_CHECKING, Callable, List, Optional, OrderedDict, Union

import numpy as np
from PIL import Image

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
import torch
from einops import rearrange

from flux_emphasis import get_weighted_text_embeddings_flux

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.benchmark_limit = 20
torch.set_float32_matmul_precision("high")
from pybase64 import standard_b64decode
from torch._dynamo import config
from torch._inductor import config as ind_config

config.cache_size_limit = 10000000000
ind_config.shape_padding = True
import platform

from loguru import logger
from torchvision.transforms import functional as TF
from tqdm import tqdm

import lora_loading
from image_encoder import ImageEncoder
from util import (
    ModelSpec,
    ModelVersion,
    into_device,
    into_dtype,
    load_config_from_path,
    load_models_from_config,
)

if platform.system() == "Windows":
    MAX_RAND = 2**16 - 1
else:
    MAX_RAND = 2**32 - 1


if TYPE_CHECKING:
    from modules.autoencoder import AutoEncoder
    from modules.conditioner import HFEmbedder
    from modules.flux_model import Flux


class FluxPipeline:
    """
    FluxPipeline is a class that provides a pipeline for generating images using the Flux model.
    It handles input preparation, timestep generation, noise generation, device management
    and model compilation.
    """

    def __init__(
        self,
        name: str,
        offload: bool = False,
        clip: "HFEmbedder" = None,
        t5: "HFEmbedder" = None,
        model: "Flux" = None,
        ae: "AutoEncoder" = None,
        dtype: torch.dtype = torch.float16,
        verbose: bool = False,
        flux_device: torch.device | str = "cuda:0",
        ae_device: torch.device | str = "cuda:1",
        clip_device: torch.device | str = "cuda:1",
        t5_device: torch.device | str = "cuda:1",
        config: ModelSpec = None,
        debug: bool = False,
    ):
        """
        Initialize the FluxPipeline class.

        This class is responsible for preparing input tensors for the Flux model, generating
        timesteps and noise, and handling device management for model offloading.
        """

        if config is None:
            raise ValueError("ModelSpec config is required!")

        self.debug = debug
        self.name = name
        self.device_flux = into_device(flux_device)
        self.device_ae = into_device(ae_device)
        self.device_clip = into_device(clip_device)
        self.device_t5 = into_device(t5_device)
        self.dtype = into_dtype(dtype)
        self.offload = offload
        self.clip: "HFEmbedder" = clip
        self.t5: "HFEmbedder" = t5
        self.model: "Flux" = model
        self.ae: "AutoEncoder" = ae
        self.rng = torch.Generator(device="cpu")
        self.img_encoder = ImageEncoder()
        self.verbose = verbose
        self.ae_dtype = torch.bfloat16
        self.config = config
        self.offload_text_encoder = config.offload_text_encoder
        self.offload_vae = config.offload_vae
        self.offload_flow = config.offload_flow

        # If models are not offloaded, move them to the appropriate devices
        if not self.offload_flow:
            self.model.to(self.device_flux)
        if not self.offload_vae:
            self.ae.to(self.device_ae)
        if not self.offload_text_encoder:
            self.clip.to(self.device_clip)
            self.t5.to(self.device_t5)

        # compile the model if needed
        if config.compile_blocks or config.compile_extras:
            self.compile()

    def set_seed(
        self, seed: int | None = None, seed_globally: bool = False
    ) -> tuple[torch.Generator, int]:
        if isinstance(seed, (int, float)):
            seed = int(abs(seed)) % MAX_RAND
            cuda_generator = torch.Generator("cuda").manual_seed(seed)
        elif isinstance(seed, str):
            try:
                seed = abs(int(seed)) % MAX_RAND
            except Exception:
                logger.warning(
                    f"Received string representation of seed, but was not able to convert to int: {seed}, using random seed"
                )
                seed = abs(self.rng.seed()) % MAX_RAND
            cuda_generator = torch.Generator("cuda").manual_seed(seed)
        else:
            seed = abs(self.rng.seed()) % MAX_RAND
            cuda_generator = torch.Generator("cuda").manual_seed(seed)

        if seed_globally:
            torch.cuda.manual_seed_all(seed)
            np.random.seed(seed)
            random.seed(seed)
        return cuda_generator, seed

    def load_lora(
        self,
        lora_path: Union[str, OrderedDict[str, torch.Tensor]],
        scale: float,
        name: Optional[str] = None,
    ):
        """
        Loads a LoRA checkpoint into the Flux flow transformer.

        Currently supports LoRA checkpoints from diffusers, whose keys usually start with transformer.[...],
        and LoRAs whose keys start with lora_unet_[...].

        Args:
            lora_path (str | OrderedDict[str, torch.Tensor]): Path to the LoRA checkpoint or an ordered dictionary containing the LoRA weights.
            scale (float): Scaling factor for the LoRA weights.
            name (str): Name of the LoRA checkpoint; can optionally be left as None, since it only acts as an identifier.
        """
        self.model.load_lora(path=lora_path, scale=scale, name=name)

    def unload_lora(self, path_or_identifier: str):
        """
        Unloads the LoRA checkpoint from the Flux flow transformer.

        Args:
            path_or_identifier (str): Path to the LoRA checkpoint or the name given to the LoRA checkpoint when it was loaded.
        """
        self.model.unload_lora(path_or_identifier=path_or_identifier)

    @torch.inference_mode()
    def compile(self):
        """
        Compiles the model and extras.

        Handles two kinds of checkpoint:

        - A) A checkpoint which already has float8 quantized weights and tuned input scales,
            in which case no warmups are run, since the input scales are assumed to be tuned already.

        - B) A checkpoint which has not been quantized, in which case it is quantized
            and the input scales are tuned via a warmup loop:
            - If the model is flux-schnell, it runs 3 warmup loops, since each loop is 4 steps.
            - If the model is flux-dev, it runs 1 warmup loop of 12 steps.
        """

        # Run warmups if the checkpoint is not prequantized
        if not self.config.prequantized_flow:
            logger.info("Running warmups for compile...")
            warmup_dict = dict(
                prompt="A beautiful test image used to solidify the fp8 nn.Linear input scales prior to compilation 😉",
                height=768,
                width=768,
                num_steps=12,
                guidance=3.5,
                seed=10,
            )
            if self.config.version == ModelVersion.flux_schnell:
                warmup_dict["num_steps"] = 4
                for _ in range(3):
                    self.generate(**warmup_dict)
            else:
                self.generate(**warmup_dict)

        # Compile the model and extras
        to_gpu_extras = [
            "vector_in",
            "img_in",
            "txt_in",
            "time_in",
            "guidance_in",
            "final_layer",
            "pe_embedder",
        ]
        if self.config.compile_blocks:
            for block in self.model.double_blocks:
                block.compile()
            for block in self.model.single_blocks:
                block.compile()
        if self.config.compile_extras:
            for extra in to_gpu_extras:
                getattr(self.model, extra).compile()

    @torch.inference_mode()
    def prepare(
        self,
        img: torch.Tensor,
        prompt: str | list[str],
        target_device: torch.device = torch.device("cuda:0"),
        target_dtype: torch.dtype = torch.float16,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Prepare input tensors for the Flux model.

        This function processes the input image and text prompt, converting them into
        the appropriate format and embedding representations required by the model.

        Args:
            img (torch.Tensor): Input image tensor of shape (batch_size, channels, height, width).
            prompt (str | list[str]): Text prompt or list of prompts guiding the image generation.
            target_device (torch.device, optional): The target device for the output tensors.
                Defaults to torch.device("cuda:0").
            target_dtype (torch.dtype, optional): The target data type for the output tensors.
                Defaults to torch.float16.

        Returns:
            tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: A tuple containing:
                - img: Processed image tensor.
                - img_ids: Image position IDs.
                - vec: Clip text embedding vector.
                - txt: T5 text embedding hidden states.
                - txt_ids: Text position IDs.

        Note:
            This function handles the necessary device management for text encoder offloading
            if enabled in the configuration.
        """
        bs, c, h, w = img.shape
        if bs == 1 and not isinstance(prompt, str):
            bs = len(prompt)
        # pack 2x2 latent patches into the sequence dimension
        img = img.unfold(2, 2, 2).unfold(3, 2, 2).permute(0, 2, 3, 1, 4, 5)
        img = img.reshape(img.shape[0], -1, img.shape[3] * img.shape[4] * img.shape[5])
        assert img.shape == (
            bs,
            (h // 2) * (w // 2),
            c * 2 * 2,
        ), f"{img.shape} != {(bs, (h//2)*(w//2), c*2*2)}"
        if img.shape[0] == 1 and bs > 1:
            img = img.repeat_interleave(bs, dim=0)

        img_ids = torch.zeros(
            h // 2, w // 2, 3, device=target_device, dtype=target_dtype
        )
        img_ids[..., 1] = (
            img_ids[..., 1]
            + torch.arange(h // 2, device=target_device, dtype=target_dtype)[:, None]
        )
        img_ids[..., 2] = (
            img_ids[..., 2]
            + torch.arange(w // 2, device=target_device, dtype=target_dtype)[None, :]
        )

        img_ids = img_ids[None].repeat(bs, 1, 1, 1).flatten(1, 2)
        if self.offload_text_encoder:
            self.clip.to(device=self.device_clip)
            self.t5.to(device=self.device_t5)

        # get the text embeddings
        vec, txt, txt_ids = get_weighted_text_embeddings_flux(
            self,
            prompt,
            num_images_per_prompt=bs,
            device=self.device_clip,
            target_device=target_device,
            target_dtype=target_dtype,
            debug=self.debug,
        )
        # offload text encoder to cpu if needed
        if self.offload_text_encoder:
            self.clip.to("cpu")
            self.t5.to("cpu")
            torch.cuda.empty_cache()
        return img, img_ids, vec, txt, txt_ids

    @torch.inference_mode()
    def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
        return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)

    def get_lin_function(
        self, x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
    ) -> Callable[[float], float]:
        m = (y2 - y1) / (x2 - x1)
        b = y1 - m * x1
        return lambda x: m * x + b

    @torch.inference_mode()
    def get_schedule(
        self,
        num_steps: int,
        image_seq_len: int,
        base_shift: float = 0.5,
        max_shift: float = 1.15,
        shift: bool = True,
    ) -> list[float]:
        """Generates a schedule of timesteps for the given number of steps and image sequence length."""
        # extra step for zero
        timesteps = torch.linspace(1, 0, num_steps + 1)

        # shifting the schedule to favor high timesteps for higher signal images
        if shift:
            # estimate mu based on linear interpolation between two points
            mu = self.get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
            timesteps = self.time_shift(mu, 1.0, timesteps)

        return timesteps.tolist()

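    # Worked example (illustrative comment, not in the original file): a
    # 1024x1024 image packs to an image_seq_len of (1024/16)**2 = 4096, so with
    # the defaults mu = 1.15, and a midpoint t = 0.5 shifts to
    # e^1.15 / (e^1.15 + 1) ≈ 0.76, keeping more steps at high noise levels.
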
    @torch.inference_mode()
    def get_noise(
        self,
        num_samples: int,
        height: int,
        width: int,
        generator: torch.Generator,
        dtype=None,
        device=None,
    ) -> torch.Tensor:
        """Generates a latent noise tensor of the given shape and dtype on the given device."""
        if device is None:
            device = self.device_flux
        if dtype is None:
            dtype = self.dtype
        return torch.randn(
            num_samples,
            16,
            # allow for packing
            2 * math.ceil(height / 16),
            2 * math.ceil(width / 16),
            device=device,
            dtype=dtype,
            generator=generator,
            requires_grad=False,
        )

    @torch.inference_mode()
    def into_bytes(self, x: torch.Tensor, jpeg_quality: int = 99) -> io.BytesIO:
        """Converts the image tensor to bytes."""
        # bring into PIL format and save
        num_images = x.shape[0]
        images: List[torch.Tensor] = []
        for i in range(num_images):
            # use a separate name so the batch tensor x is not clobbered mid-loop
            img = (
                x[i]
                .clamp(-1, 1)
                .add(1.0)
                .mul(127.5)
                .clamp(0, 255)
                .contiguous()
                .type(torch.uint8)
            )
            images.append(img)
        if len(images) == 1:
            im = images[0]
        else:
            im = torch.vstack(images)

        im = self.img_encoder.encode_torch(im, quality=jpeg_quality)
        images.clear()
        return im

    @torch.inference_mode()
    def load_init_image_if_needed(
        self, init_image: torch.Tensor | str | Image.Image | np.ndarray
    ) -> torch.Tensor:
        """
        Loads the initial image if it is a string, numpy array, or PIL.Image;
        if it is a torch.Tensor, it is expected to be in the correct format and is returned as is.
        """
        if isinstance(init_image, str):
            try:
                init_image = Image.open(init_image)
            except Exception:
                # not a file path; treat the string as a base64-encoded image
                init_image = Image.open(
                    io.BytesIO(standard_b64decode(init_image.split(",")[-1]))
                )
            init_image = torch.from_numpy(np.array(init_image)).type(torch.uint8)
        elif isinstance(init_image, np.ndarray):
            init_image = torch.from_numpy(init_image).type(torch.uint8)
        elif isinstance(init_image, Image.Image):
            init_image = torch.from_numpy(np.array(init_image)).type(torch.uint8)

        return init_image

    @torch.inference_mode()
    def vae_decode(self, x: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """Decodes the latent tensor to the pixel space."""
        if self.offload_vae:
            self.ae.to(self.device_ae)
        x = x.to(self.device_ae)
        x = self.unpack(x.float(), height, width)
        with torch.autocast(
            device_type=self.device_ae.type, dtype=torch.bfloat16, cache_enabled=False
        ):
            x = self.ae.decode(x)
        if self.offload_vae:
            self.ae.to("cpu")
            torch.cuda.empty_cache()
        return x

    def unpack(self, x: torch.Tensor, height: int, width: int) -> torch.Tensor:
        return rearrange(
            x,
            "b (h w) (c ph pw) -> b c (h ph) (w pw)",
            h=math.ceil(height / 16),
            w=math.ceil(width / 16),
            ph=2,
            pw=2,
        )

    @torch.inference_mode()
    def resize_center_crop(
        self, img: torch.Tensor, height: int, width: int
    ) -> torch.Tensor:
        """Resizes and crops the image to the given height and width."""
        img = TF.resize(img, min(width, height))
        img = TF.center_crop(img, (height, width))
        return img

459 |
+
@torch.inference_mode()
|
460 |
+
def preprocess_latent(
|
461 |
+
self,
|
462 |
+
init_image: torch.Tensor | np.ndarray = None,
|
463 |
+
height: int = 720,
|
464 |
+
width: int = 1024,
|
465 |
+
num_steps: int = 20,
|
466 |
+
strength: float = 1.0,
|
467 |
+
generator: torch.Generator = None,
|
468 |
+
num_images: int = 1,
|
469 |
+
) -> tuple[torch.Tensor, List[float]]:
|
470 |
+
"""
|
471 |
+
Preprocesses the latent tensor for the given number of steps and image sequence length.
|
472 |
+
Also, if an initial image is provided, it is vae encoded and injected with the appropriate noise
|
473 |
+
given the strength and number of steps replacing the latent tensor.
|
474 |
+
"""
|
475 |
+
# prepare input
|
476 |
+
|
477 |
+
if init_image is not None:
|
478 |
+
if isinstance(init_image, np.ndarray):
|
479 |
+
init_image = torch.from_numpy(init_image)
|
480 |
+
|
481 |
+
init_image = (
|
482 |
+
init_image.permute(2, 0, 1)
|
483 |
+
.contiguous()
|
484 |
+
.to(self.device_ae, dtype=self.ae_dtype)
|
485 |
+
.div(127.5)
|
486 |
+
.sub(1)[None, ...]
|
487 |
+
)
|
488 |
+
init_image = self.resize_center_crop(init_image, height, width)
|
489 |
+
with torch.autocast(
|
490 |
+
device_type=self.device_ae.type,
|
491 |
+
dtype=torch.bfloat16,
|
492 |
+
cache_enabled=False,
|
493 |
+
):
|
494 |
+
if self.offload_vae:
|
495 |
+
self.ae.to(self.device_ae)
|
496 |
+
init_image = (
|
497 |
+
self.ae.encode(init_image)
|
498 |
+
.to(dtype=self.dtype, device=self.device_flux)
|
499 |
+
.repeat(num_images, 1, 1, 1)
|
500 |
+
)
|
501 |
+
if self.offload_vae:
|
502 |
+
self.ae.to("cpu")
|
503 |
+
torch.cuda.empty_cache()
|
504 |
+
|
505 |
+
x = self.get_noise(
|
506 |
+
num_images,
|
507 |
+
height,
|
508 |
+
width,
|
509 |
+
device=self.device_flux,
|
510 |
+
dtype=self.dtype,
|
511 |
+
generator=generator,
|
512 |
+
)
|
513 |
+
timesteps = self.get_schedule(
|
514 |
+
num_steps=num_steps,
|
515 |
+
image_seq_len=x.shape[-1] * x.shape[-2] // 4,
|
516 |
+
shift=(self.name != "flux-schnell"),
|
517 |
+
)
|
518 |
+
if init_image is not None:
|
519 |
+
t_idx = int((1 - strength) * num_steps)
|
520 |
+
t = timesteps[t_idx]
|
521 |
+
timesteps = timesteps[t_idx:]
|
522 |
+
x = t * x + (1.0 - t) * init_image
|
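            # This blend is the rectified-flow img2img starting point: pure noise
            # and the VAE-encoded init image are mixed at timestep t, and denoising
            # then runs only over the truncated schedule timesteps[t_idx:].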
523 |
+
return x, timesteps
|
524 |
+
|
525 |
+
@torch.inference_mode()
|
526 |
+
def generate(
|
527 |
+
self,
|
528 |
+
prompt: str,
|
529 |
+
width: int = 720,
|
530 |
+
height: int = 1024,
|
531 |
+
num_steps: int = 24,
|
532 |
+
guidance: float = 3.5,
|
533 |
+
seed: int | None = None,
|
534 |
+
init_image: torch.Tensor | str | Image.Image | np.ndarray | None = None,
|
535 |
+
strength: float = 1.0,
|
536 |
+
silent: bool = False,
|
537 |
+
num_images: int = 1,
|
538 |
+
return_seed: bool = False,
|
539 |
+
jpeg_quality: int = 99,
|
540 |
+
) -> io.BytesIO:
|
541 |
+
"""
|
542 |
+
Generate images based on the given prompt and parameters.
|
543 |
+
|
544 |
+
Args:
|
545 |
+
prompt `(str)`: The text prompt to guide the image generation.
|
546 |
+
|
547 |
+
width `(int, optional)`: Width of the generated image. Defaults to 720.
|
548 |
+
|
549 |
+
height `(int, optional)`: Height of the generated image. Defaults to 1024.
|
550 |
+
|
551 |
+
num_steps `(int, optional)`: Number of denoising steps. Defaults to 24.
|
552 |
+
|
553 |
+
guidance `(float, optional)`: Guidance scale for text-to-image generation. Defaults to 3.5.
|
554 |
+
|
555 |
+
seed `(int | None, optional)`: Random seed for reproducibility. If None, a random seed is used. Defaults to None.
|
556 |
+
|
557 |
+
init_image `(torch.Tensor | str | Image.Image | np.ndarray | None, optional)`: Initial image for image-to-image generation. Defaults to None.
|
558 |
+
|
559 |
+
-- note: if the image's height/width do not match the height/width of the generated image, the image is resized and center-cropped to match the height/width arguments.
|
560 |
+
|
561 |
+
-- If a string is provided, it is assumed to be either a path to an image file or a base64 encoded image.
|
562 |
+
|
563 |
+
-- If a numpy array is provided, it is assumed to be an RGB numpy array of shape (height, width, 3) and dtype uint8.
|
564 |
+
|
565 |
+
-- If a PIL.Image is provided, it is assumed to be an RGB PIL.Image.
|
566 |
+
|
567 |
+
-- If a torch.Tensor is provided, it is assumed to be a torch.Tensor of shape (height, width, 3) and dtype uint8 with range [0, 255].
|
568 |
+
|
569 |
+
strength `(float, optional)`: Strength of the init_image in image-to-image generation. Defaults to 1.0.
|
570 |
+
|
571 |
+
silent `(bool, optional)`: If True, suppresses progress bar. Defaults to False.
|
572 |
+
|
573 |
+
num_images `(int, optional)`: Number of images to generate. Defaults to 1.
|
574 |
+
|
575 |
+
return_seed `(bool, optional)`: If True, returns the seed along with the generated image. Defaults to False.
|
576 |
+
|
577 |
+
jpeg_quality `(int, optional)`: Quality of the JPEG compression. Defaults to 99.
|
578 |
+
|
579 |
+
Returns:
|
580 |
+
io.BytesIO: Generated image(s) in bytes format.
|
581 |
+
int: Seed used for generation (only if return_seed is True).
|
582 |
+
"""
|
583 |
+
num_steps = 4 if self.name == "flux-schnell" else num_steps
|
584 |
+
|
585 |
+
init_image = self.load_init_image_if_needed(init_image)
|
586 |
+
|
587 |
+
# allow for packing and conversion to latent space
|
588 |
+
height = 16 * (height // 16)
|
589 |
+
width = 16 * (width // 16)
|
590 |
+
|
591 |
+
generator, seed = self.set_seed(seed)
|
592 |
+
|
593 |
+
if not silent:
|
594 |
+
logger.info(f"Generating with:\nSeed: {seed}\nPrompt: {prompt}")
|
595 |
+
|
596 |
+
# preprocess the latent
|
597 |
+
img, timesteps = self.preprocess_latent(
|
598 |
+
init_image=init_image,
|
599 |
+
height=height,
|
600 |
+
width=width,
|
601 |
+
num_steps=num_steps,
|
602 |
+
strength=strength,
|
603 |
+
generator=generator,
|
604 |
+
num_images=num_images,
|
605 |
+
)
|
606 |
+
|
607 |
+
# prepare inputs
|
608 |
+
img, img_ids, vec, txt, txt_ids = map(
|
609 |
+
lambda x: x.contiguous(),
|
610 |
+
self.prepare(
|
611 |
+
img=img,
|
612 |
+
prompt=prompt,
|
613 |
+
target_device=self.device_flux,
|
614 |
+
target_dtype=self.dtype,
|
615 |
+
),
|
616 |
+
)
|
617 |
+
|
618 |
+
# this is ignored for schnell
|
619 |
+
guidance_vec = torch.full(
|
620 |
+
(img.shape[0],), guidance, device=self.device_flux, dtype=self.dtype
|
621 |
+
)
|
622 |
+
t_vec = None
|
623 |
+
# dispatch to gpu if offloaded
|
624 |
+
if self.offload_flow:
|
625 |
+
self.model.to(self.device_flux)
|
626 |
+
|
627 |
+
# perform the denoising loop
|
628 |
+
for t_curr, t_prev in tqdm(
|
629 |
+
zip(timesteps[:-1], timesteps[1:]), total=len(timesteps) - 1, disable=silent
|
630 |
+
):
|
631 |
+
if t_vec is None:
|
632 |
+
t_vec = torch.full(
|
633 |
+
(img.shape[0],),
|
634 |
+
t_curr,
|
635 |
+
dtype=self.dtype,
|
636 |
+
device=self.device_flux,
|
637 |
+
)
|
638 |
+
else:
|
639 |
+
t_vec = t_vec.reshape((img.shape[0],)).fill_(t_curr)
|
640 |
+
|
641 |
+
pred = self.model.forward(
|
642 |
+
img=img,
|
643 |
+
img_ids=img_ids,
|
644 |
+
txt=txt,
|
645 |
+
txt_ids=txt_ids,
|
646 |
+
y=vec,
|
647 |
+
timesteps=t_vec,
|
648 |
+
guidance=guidance_vec,
|
649 |
+
)
|
650 |
+
|
651 |
+
img = img + (t_prev - t_curr) * pred
|
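            # One Euler step of the flow ODE; timesteps descend toward 0, so
            # (t_prev - t_curr) < 0 and img moves from noise toward the image.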
652 |
+
|
653 |
+
# offload the model to cpu if needed
|
654 |
+
if self.offload_flow:
|
655 |
+
self.model.to("cpu")
|
656 |
+
torch.cuda.empty_cache()
|
657 |
+
|
658 |
+
# decode latents to pixel space
|
659 |
+
img = self.vae_decode(img, height, width)
|
660 |
+
|
661 |
+
if return_seed:
|
662 |
+
return self.into_bytes(img, jpeg_quality=jpeg_quality), seed
|
663 |
+
return self.into_bytes(img, jpeg_quality=jpeg_quality)
|
664 |
+
|
665 |
+
@classmethod
|
666 |
+
def load_pipeline_from_config_path(
|
667 |
+
cls, path: str, flow_model_path: str | None = None, debug: bool = False, **kwargs
|
668 |
+
) -> "FluxPipeline":
|
669 |
+
with torch.inference_mode():
|
670 |
+
config = load_config_from_path(path)
|
671 |
+
if flow_model_path:
|
672 |
+
config.ckpt_path = flow_model_path
|
673 |
+
for k, v in kwargs.items():
|
674 |
+
if hasattr(config, k):
|
675 |
+
logger.info(
|
676 |
+
f"Overriding config {k}:{getattr(config, k)} with value {v}"
|
677 |
+
)
|
678 |
+
setattr(config, k, v)
|
679 |
+
return cls.load_pipeline_from_config(config, debug=debug)
|
680 |
+
|
681 |
+
@classmethod
|
682 |
+
def load_pipeline_from_config(
|
683 |
+
cls, config: ModelSpec, debug: bool = False
|
684 |
+
) -> "FluxPipeline":
|
685 |
+
from float8_quantize import quantize_flow_transformer_and_dispatch_float8
|
686 |
+
|
687 |
+
with torch.inference_mode():
|
688 |
+
if debug:
|
689 |
+
logger.info(
|
690 |
+
f"Loading as prequantized flow transformer? {config.prequantized_flow}"
|
691 |
+
)
|
692 |
+
|
693 |
+
models = load_models_from_config(config)
|
694 |
+
config = models.config
|
695 |
+
flux_device = into_device(config.flux_device)
|
696 |
+
ae_device = into_device(config.ae_device)
|
697 |
+
clip_device = into_device(config.text_enc_device)
|
698 |
+
t5_device = into_device(config.text_enc_device)
|
699 |
+
flux_dtype = into_dtype(config.flow_dtype)
|
700 |
+
flow_model = models.flow
|
701 |
+
|
702 |
+
if not config.prequantized_flow:
|
703 |
+
flow_model = quantize_flow_transformer_and_dispatch_float8(
|
704 |
+
flow_model,
|
705 |
+
flux_device,
|
706 |
+
offload_flow=config.offload_flow,
|
707 |
+
swap_linears_with_cublaslinear=flux_dtype == torch.float16,
|
708 |
+
flow_dtype=flux_dtype,
|
709 |
+
quantize_modulation=config.quantize_modulation,
|
710 |
+
quantize_flow_embedder_layers=config.quantize_flow_embedder_layers,
|
711 |
+
)
|
712 |
+
else:
|
713 |
+
flow_model.eval().requires_grad_(False)
|
714 |
+
|
715 |
+
return cls(
|
716 |
+
name=config.version,
|
717 |
+
clip=models.clip,
|
718 |
+
t5=models.t5,
|
719 |
+
model=flow_model,
|
720 |
+
ae=models.ae,
|
721 |
+
dtype=flux_dtype,
|
722 |
+
verbose=False,
|
723 |
+
flux_device=flux_device,
|
724 |
+
ae_device=ae_device,
|
725 |
+
clip_device=clip_device,
|
726 |
+
t5_device=t5_device,
|
727 |
+
config=config,
|
728 |
+
debug=debug,
|
729 |
+
)
|
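
For orientation, a minimal usage sketch of the pipeline defined above (the config path, prompt, and output name are illustrative; any of the bundled configs/*.json should work the same way):

from flux_pipeline import FluxPipeline

# Build a pipeline from one of the bundled config files.
pipe = FluxPipeline.load_pipeline_from_config_path("configs/config-dev.json")

# generate() returns JPEG bytes in an io.BytesIO (plus the seed when return_seed=True).
jpeg, seed = pipe.generate(
    prompt="a misty forest at dawn",
    width=1024,
    height=720,
    num_steps=24,
    guidance=3.5,
    return_seed=True,
)
with open("out.jpg", "wb") as f:
    f.write(jpeg.getvalue())
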
image_encoder.py
ADDED
@@ -0,0 +1,35 @@
1 |
+
import io
|
2 |
+
from PIL import Image
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
|
6 |
+
|
7 |
+
class ImageEncoder:
|
8 |
+
|
9 |
+
@torch.inference_mode()
|
10 |
+
def encode_torch(self, img: torch.Tensor, quality=95):
|
11 |
+
if img.ndim == 2:
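            # grayscale (H, W): replicate to 3 channels and convert to channels-last uint8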
|
12 |
+
img = (
|
13 |
+
img[None]
|
14 |
+
.repeat_interleave(3, dim=0)
|
15 |
+
.permute(1, 2, 0)
|
16 |
+
.contiguous()
|
17 |
+
.clamp(0, 255)
|
18 |
+
.type(torch.uint8)
|
19 |
+
)
|
20 |
+
elif img.ndim == 3:
|
21 |
+
if img.shape[0] == 3:
|
22 |
+
img = img.permute(1, 2, 0).contiguous().clamp(0, 255).type(torch.uint8)
|
23 |
+
elif img.shape[2] == 3:
|
24 |
+
img = img.contiguous().clamp(0, 255).type(torch.uint8)
|
25 |
+
else:
|
26 |
+
raise ValueError(f"Unsupported image shape: {img.shape}")
|
27 |
+
else:
|
28 |
+
raise ValueError(f"Unsupported image num dims: {img.ndim}")
|
29 |
+
|
30 |
+
img = img.cpu().numpy().astype(np.uint8)
|
31 |
+
im = Image.fromarray(img)
|
32 |
+
iob = io.BytesIO()
|
33 |
+
im.save(iob, format="JPEG", quality=quality)
|
34 |
+
iob.seek(0)
|
35 |
+
return iob
|
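
A quick sketch of driving ImageEncoder.encode_torch above (the random tensor is a stand-in for a decoded image; shapes and values are illustrative):

import torch
from image_encoder import ImageEncoder

encoder = ImageEncoder()
# channels-last RGB tensor, shape (H, W, 3), values in [0, 255]
img = torch.randint(0, 256, (256, 256, 3), dtype=torch.uint8)
iob = encoder.encode_torch(img, quality=95)  # io.BytesIO holding JPEG bytes
with open("preview.jpg", "wb") as f:
    f.write(iob.getvalue())
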
lora_loading.py
ADDED
@@ -0,0 +1,753 @@
1 |
+
import re
|
2 |
+
from typing import Optional, OrderedDict, Tuple, TypeAlias, Union
|
3 |
+
import torch
|
4 |
+
from loguru import logger
|
5 |
+
from safetensors.torch import load_file
|
6 |
+
from tqdm import tqdm
|
7 |
+
from torch import nn
|
8 |
+
|
9 |
+
try:
|
10 |
+
from cublas_ops import CublasLinear
|
11 |
+
except Exception:
|
12 |
+
CublasLinear = type(None)
|
13 |
+
from float8_quantize import F8Linear
|
14 |
+
from modules.flux_model import Flux
|
15 |
+
|
16 |
+
path_regex = re.compile(r"/|\\")
|
17 |
+
|
18 |
+
StateDict: TypeAlias = OrderedDict[str, torch.Tensor]
|
19 |
+
|
20 |
+
|
21 |
+
class LoraWeights:
|
22 |
+
def __init__(
|
23 |
+
self,
|
24 |
+
weights: StateDict,
|
25 |
+
path: str,
|
26 |
+
name: str = None,
|
27 |
+
scale: float = 1.0,
|
28 |
+
) -> None:
|
29 |
+
self.path = path
|
30 |
+
self.weights = weights
|
31 |
+
self.name = name if name else path_regex.split(path)[-1]
|
32 |
+
self.scale = scale
|
33 |
+
|
34 |
+
|
35 |
+
def swap_scale_shift(weight):
|
36 |
+
scale, shift = weight.chunk(2, dim=0)
|
37 |
+
new_weight = torch.cat([shift, scale], dim=0)
|
38 |
+
return new_weight
|
39 |
+
|
40 |
+
|
41 |
+
def check_if_lora_exists(state_dict, lora_name):
|
42 |
+
subkey = lora_name.split(".lora_A")[0].split(".lora_B")[0].split(".weight")[0]
|
43 |
+
for key in state_dict.keys():
|
44 |
+
if subkey in key:
|
45 |
+
return subkey
|
46 |
+
return False
|
47 |
+
|
48 |
+
|
49 |
+
def convert_if_lora_exists(new_state_dict, state_dict, lora_name, flux_layer_name):
|
50 |
+
if (original_stubkey := check_if_lora_exists(state_dict, lora_name)) is not False:
|
51 |
+
weights_to_pop = [k for k in state_dict.keys() if original_stubkey in k]
|
52 |
+
for key in weights_to_pop:
|
53 |
+
key_replacement = key.replace(
|
54 |
+
original_stubkey, flux_layer_name.replace(".weight", "")
|
55 |
+
)
|
56 |
+
new_state_dict[key_replacement] = state_dict.pop(key)
|
57 |
+
return new_state_dict, state_dict
|
58 |
+
else:
|
59 |
+
return new_state_dict, state_dict
|
60 |
+
|
61 |
+
|
62 |
+
def convert_diffusers_to_flux_transformer_checkpoint(
|
63 |
+
diffusers_state_dict,
|
64 |
+
num_layers,
|
65 |
+
num_single_layers,
|
66 |
+
has_guidance=True,
|
67 |
+
prefix="",
|
68 |
+
):
|
69 |
+
original_state_dict = {}
|
70 |
+
|
71 |
+
# time_text_embed.timestep_embedder -> time_in
|
72 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
73 |
+
original_state_dict,
|
74 |
+
diffusers_state_dict,
|
75 |
+
f"{prefix}time_text_embed.timestep_embedder.linear_1.weight",
|
76 |
+
"time_in.in_layer.weight",
|
77 |
+
)
|
78 |
+
# time_text_embed.text_embedder -> vector_in
|
79 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
80 |
+
original_state_dict,
|
81 |
+
diffusers_state_dict,
|
82 |
+
f"{prefix}time_text_embed.text_embedder.linear_1.weight",
|
83 |
+
"vector_in.in_layer.weight",
|
84 |
+
)
|
85 |
+
|
86 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
87 |
+
original_state_dict,
|
88 |
+
diffusers_state_dict,
|
89 |
+
f"{prefix}time_text_embed.text_embedder.linear_2.weight",
|
90 |
+
"vector_in.out_layer.weight",
|
91 |
+
)
|
92 |
+
|
93 |
+
if has_guidance:
|
94 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
95 |
+
original_state_dict,
|
96 |
+
diffusers_state_dict,
|
97 |
+
f"{prefix}time_text_embed.guidance_embedder.linear_1.weight",
|
98 |
+
"guidance_in.in_layer.weight",
|
99 |
+
)
|
100 |
+
|
101 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
102 |
+
original_state_dict,
|
103 |
+
diffusers_state_dict,
|
104 |
+
f"{prefix}time_text_embed.guidance_embedder.linear_2.weight",
|
105 |
+
"guidance_in.out_layer.weight",
|
106 |
+
)
|
107 |
+
|
108 |
+
# context_embedder -> txt_in
|
109 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
110 |
+
original_state_dict,
|
111 |
+
diffusers_state_dict,
|
112 |
+
f"{prefix}context_embedder.weight",
|
113 |
+
"txt_in.weight",
|
114 |
+
)
|
115 |
+
|
116 |
+
# x_embedder -> img_in
|
117 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
118 |
+
original_state_dict,
|
119 |
+
diffusers_state_dict,
|
120 |
+
f"{prefix}x_embedder.weight",
|
121 |
+
"img_in.weight",
|
122 |
+
)
|
123 |
+
# double transformer blocks
|
124 |
+
for i in range(num_layers):
|
125 |
+
block_prefix = f"transformer_blocks.{i}."
|
126 |
+
# norms
|
127 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
128 |
+
original_state_dict,
|
129 |
+
diffusers_state_dict,
|
130 |
+
f"{prefix}{block_prefix}norm1.linear.weight",
|
131 |
+
f"double_blocks.{i}.img_mod.lin.weight",
|
132 |
+
)
|
133 |
+
|
134 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
135 |
+
original_state_dict,
|
136 |
+
diffusers_state_dict,
|
137 |
+
f"{prefix}{block_prefix}norm1_context.linear.weight",
|
138 |
+
f"double_blocks.{i}.txt_mod.lin.weight",
|
139 |
+
)
|
140 |
+
|
141 |
+
# Q, K, V
|
142 |
+
temp_dict = {}
|
143 |
+
|
144 |
+
expected_shape_qkv_a = None
|
145 |
+
expected_shape_qkv_b = None
|
146 |
+
expected_shape_add_qkv_a = None
|
147 |
+
expected_shape_add_qkv_b = None
|
148 |
+
dtype = None
|
149 |
+
device = None
|
150 |
+
|
151 |
+
for component in [
|
152 |
+
"to_q",
|
153 |
+
"to_k",
|
154 |
+
"to_v",
|
155 |
+
"add_q_proj",
|
156 |
+
"add_k_proj",
|
157 |
+
"add_v_proj",
|
158 |
+
]:
|
159 |
+
|
160 |
+
sample_component_A_key = (
|
161 |
+
f"{prefix}{block_prefix}attn.{component}.lora_A.weight"
|
162 |
+
)
|
163 |
+
sample_component_B_key = (
|
164 |
+
f"{prefix}{block_prefix}attn.{component}.lora_B.weight"
|
165 |
+
)
|
166 |
+
if (
|
167 |
+
sample_component_A_key in diffusers_state_dict
|
168 |
+
and sample_component_B_key in diffusers_state_dict
|
169 |
+
):
|
170 |
+
sample_component_A = diffusers_state_dict.pop(sample_component_A_key)
|
171 |
+
sample_component_B = diffusers_state_dict.pop(sample_component_B_key)
|
172 |
+
temp_dict[f"{component}"] = [sample_component_A, sample_component_B]
|
173 |
+
if expected_shape_qkv_a is None and not component.startswith("add_"):
|
174 |
+
expected_shape_qkv_a = sample_component_A.shape
|
175 |
+
expected_shape_qkv_b = sample_component_B.shape
|
176 |
+
dtype = sample_component_A.dtype
|
177 |
+
device = sample_component_A.device
|
178 |
+
if expected_shape_add_qkv_a is None and component.startswith("add_"):
|
179 |
+
expected_shape_add_qkv_a = sample_component_A.shape
|
180 |
+
expected_shape_add_qkv_b = sample_component_B.shape
|
181 |
+
dtype = sample_component_A.dtype
|
182 |
+
device = sample_component_A.device
|
183 |
+
else:
|
184 |
+
logger.info(
|
185 |
+
f"Skipping layer {i} since no LoRA weight is available for {sample_component_A_key}"
|
186 |
+
)
|
187 |
+
temp_dict[f"{component}"] = [None, None]
|
188 |
+
|
189 |
+
if device is not None:
|
190 |
+
if expected_shape_qkv_a is not None:
|
191 |
+
|
192 |
+
if (sq := temp_dict["to_q"])[0] is not None:
|
193 |
+
sample_q_A, sample_q_B = sq
|
194 |
+
else:
|
195 |
+
sample_q_A, sample_q_B = [
|
196 |
+
torch.zeros(expected_shape_qkv_a, dtype=dtype, device=device),
|
197 |
+
torch.zeros(expected_shape_qkv_b, dtype=dtype, device=device),
|
198 |
+
]
|
199 |
+
if (sq := temp_dict["to_k"])[0] is not None:
|
200 |
+
sample_k_A, sample_k_B = sq
|
201 |
+
else:
|
202 |
+
sample_k_A, sample_k_B = [
|
203 |
+
torch.zeros(expected_shape_qkv_a, dtype=dtype, device=device),
|
204 |
+
torch.zeros(expected_shape_qkv_b, dtype=dtype, device=device),
|
205 |
+
]
|
206 |
+
if (sq := temp_dict["to_v"])[0] is not None:
|
207 |
+
sample_v_A, sample_v_B = sq
|
208 |
+
else:
|
209 |
+
sample_v_A, sample_v_B = [
|
210 |
+
torch.zeros(expected_shape_qkv_a, dtype=dtype, device=device),
|
211 |
+
torch.zeros(expected_shape_qkv_b, dtype=dtype, device=device),
|
212 |
+
]
|
213 |
+
original_state_dict[f"double_blocks.{i}.img_attn.qkv.lora_A.weight"] = (
|
214 |
+
torch.cat([sample_q_A, sample_k_A, sample_v_A], dim=0)
|
215 |
+
)
|
216 |
+
original_state_dict[f"double_blocks.{i}.img_attn.qkv.lora_B.weight"] = (
|
217 |
+
torch.cat([sample_q_B, sample_k_B, sample_v_B], dim=0)
|
218 |
+
)
|
219 |
+
if expected_shape_add_qkv_a is not None:
|
220 |
+
|
221 |
+
if (sq := temp_dict["add_q_proj"])[0] is not None:
|
222 |
+
context_q_A, context_q_B = sq
|
223 |
+
else:
|
224 |
+
context_q_A, context_q_B = [
|
225 |
+
torch.zeros(
|
226 |
+
expected_shape_add_qkv_a, dtype=dtype, device=device
|
227 |
+
),
|
228 |
+
torch.zeros(
|
229 |
+
expected_shape_add_qkv_b, dtype=dtype, device=device
|
230 |
+
),
|
231 |
+
]
|
232 |
+
if (sq := temp_dict["add_k_proj"])[0] is not None:
|
233 |
+
context_k_A, context_k_B = sq
|
234 |
+
else:
|
235 |
+
context_k_A, context_k_B = [
|
236 |
+
torch.zeros(
|
237 |
+
expected_shape_add_qkv_a, dtype=dtype, device=device
|
238 |
+
),
|
239 |
+
torch.zeros(
|
240 |
+
expected_shape_add_qkv_b, dtype=dtype, device=device
|
241 |
+
),
|
242 |
+
]
|
243 |
+
if (sq := temp_dict["add_v_proj"])[0] is not None:
|
244 |
+
context_v_A, context_v_B = sq
|
245 |
+
else:
|
246 |
+
context_v_A, context_v_B = [
|
247 |
+
torch.zeros(
|
248 |
+
expected_shape_add_qkv_a, dtype=dtype, device=device
|
249 |
+
),
|
250 |
+
torch.zeros(
|
251 |
+
expected_shape_add_qkv_b, dtype=dtype, device=device
|
252 |
+
),
|
253 |
+
]
|
254 |
+
|
255 |
+
original_state_dict[f"double_blocks.{i}.txt_attn.qkv.lora_A.weight"] = (
|
256 |
+
torch.cat([context_q_A, context_k_A, context_v_A], dim=0)
|
257 |
+
)
|
258 |
+
original_state_dict[f"double_blocks.{i}.txt_attn.qkv.lora_B.weight"] = (
|
259 |
+
torch.cat([context_q_B, context_k_B, context_v_B], dim=0)
|
260 |
+
)
|
261 |
+
|
262 |
+
# qk_norm
|
263 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
264 |
+
original_state_dict,
|
265 |
+
diffusers_state_dict,
|
266 |
+
f"{prefix}{block_prefix}attn.norm_q.weight",
|
267 |
+
f"double_blocks.{i}.img_attn.norm.query_norm.scale",
|
268 |
+
)
|
269 |
+
|
270 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
271 |
+
original_state_dict,
|
272 |
+
diffusers_state_dict,
|
273 |
+
f"{prefix}{block_prefix}attn.norm_k.weight",
|
274 |
+
f"double_blocks.{i}.img_attn.norm.key_norm.scale",
|
275 |
+
)
|
276 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
277 |
+
original_state_dict,
|
278 |
+
diffusers_state_dict,
|
279 |
+
f"{prefix}{block_prefix}attn.norm_added_q.weight",
|
280 |
+
f"double_blocks.{i}.txt_attn.norm.query_norm.scale",
|
281 |
+
)
|
282 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
283 |
+
original_state_dict,
|
284 |
+
diffusers_state_dict,
|
285 |
+
f"{prefix}{block_prefix}attn.norm_added_k.weight",
|
286 |
+
f"double_blocks.{i}.txt_attn.norm.key_norm.scale",
|
287 |
+
)
|
288 |
+
|
289 |
+
# ff img_mlp
|
290 |
+
|
291 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
292 |
+
original_state_dict,
|
293 |
+
diffusers_state_dict,
|
294 |
+
f"{prefix}{block_prefix}ff.net.0.proj.weight",
|
295 |
+
f"double_blocks.{i}.img_mlp.0.weight",
|
296 |
+
)
|
297 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
298 |
+
original_state_dict,
|
299 |
+
diffusers_state_dict,
|
300 |
+
f"{prefix}{block_prefix}ff.net.2.weight",
|
301 |
+
f"double_blocks.{i}.img_mlp.2.weight",
|
302 |
+
)
|
303 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
304 |
+
original_state_dict,
|
305 |
+
diffusers_state_dict,
|
306 |
+
f"{prefix}{block_prefix}ff_context.net.0.proj.weight",
|
307 |
+
f"double_blocks.{i}.txt_mlp.0.weight",
|
308 |
+
)
|
309 |
+
|
310 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
311 |
+
original_state_dict,
|
312 |
+
diffusers_state_dict,
|
313 |
+
f"{prefix}{block_prefix}ff_context.net.2.weight",
|
314 |
+
f"double_blocks.{i}.txt_mlp.2.weight",
|
315 |
+
)
|
316 |
+
# output projections
|
317 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
318 |
+
original_state_dict,
|
319 |
+
diffusers_state_dict,
|
320 |
+
f"{prefix}{block_prefix}attn.to_out.0.weight",
|
321 |
+
f"double_blocks.{i}.img_attn.proj.weight",
|
322 |
+
)
|
323 |
+
|
324 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
325 |
+
original_state_dict,
|
326 |
+
diffusers_state_dict,
|
327 |
+
f"{prefix}{block_prefix}attn.to_add_out.weight",
|
328 |
+
f"double_blocks.{i}.txt_attn.proj.weight",
|
329 |
+
)
|
330 |
+
|
331 |
+
# single transformer blocks
|
332 |
+
for i in range(num_single_layers):
|
333 |
+
block_prefix = f"single_transformer_blocks.{i}."
|
334 |
+
# norm.linear -> single_blocks.0.modulation.lin
|
335 |
+
key_norm = f"{prefix}{block_prefix}norm.linear.weight"
|
336 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
337 |
+
original_state_dict,
|
338 |
+
diffusers_state_dict,
|
339 |
+
key_norm,
|
340 |
+
f"single_blocks.{i}.modulation.lin.weight",
|
341 |
+
)
|
342 |
+
|
343 |
+
has_q, has_k, has_v, has_mlp = False, False, False, False
|
344 |
+
shape_qkv_a = None
|
345 |
+
shape_qkv_b = None
|
346 |
+
# Q, K, V, mlp
|
347 |
+
q_A = diffusers_state_dict.pop(f"{prefix}{block_prefix}attn.to_q.lora_A.weight", None)
|
348 |
+
q_B = diffusers_state_dict.pop(f"{prefix}{block_prefix}attn.to_q.lora_B.weight", None)
|
349 |
+
if q_A is not None and q_B is not None:
|
350 |
+
has_q = True
|
351 |
+
shape_qkv_a = q_A.shape
|
352 |
+
shape_qkv_b = q_B.shape
|
353 |
+
k_A = diffusers_state_dict.pop(f"{prefix}{block_prefix}attn.to_k.lora_A.weight", None)
|
354 |
+
k_B = diffusers_state_dict.pop(f"{prefix}{block_prefix}attn.to_k.lora_B.weight", None)
|
355 |
+
if k_A is not None and k_B is not None:
|
356 |
+
has_k = True
|
357 |
+
shape_qkv_a = k_A.shape
|
358 |
+
shape_qkv_b = k_B.shape
|
359 |
+
v_A = diffusers_state_dict.pop(f"{prefix}{block_prefix}attn.to_v.lora_A.weight", None)
|
360 |
+
v_B = diffusers_state_dict.pop(f"{prefix}{block_prefix}attn.to_v.lora_B.weight", None)
|
361 |
+
if v_A is not None and v_B is not None:
|
362 |
+
has_v = True
|
363 |
+
shape_qkv_a = v_A.shape
|
364 |
+
shape_qkv_b = v_B.shape
|
365 |
+
mlp_A = diffusers_state_dict.pop(
|
366 |
+
f"{prefix}{block_prefix}proj_mlp.lora_A.weight"
|
367 |
+
)
|
368 |
+
mlp_B = diffusers_state_dict.pop(
|
369 |
+
f"{prefix}{block_prefix}proj_mlp.lora_B.weight"
|
370 |
+
)
|
371 |
+
if mlp_A is not None and mlp_B is not None:
|
372 |
+
has_mlp = True
|
373 |
+
shape_qkv_a = mlp_A.shape
|
374 |
+
shape_qkv_b = mlp_B.shape
|
375 |
+
if any([has_q, has_k, has_v, has_mlp]):
|
376 |
+
if not has_q:
|
377 |
+
q_A, q_B = [
|
378 |
+
torch.zeros(shape_qkv_a, dtype=dtype, device=device),
|
379 |
+
torch.zeros(shape_qkv_b, dtype=dtype, device=device),
|
380 |
+
]
|
381 |
+
if not has_k:
|
382 |
+
k_A, k_B = [
|
383 |
+
torch.zeros(shape_qkv_a, dtype=dtype, device=device),
|
384 |
+
torch.zeros(shape_qkv_b, dtype=dtype, device=device),
|
385 |
+
]
|
386 |
+
if not has_v:
|
387 |
+
v_A, v_B = [
|
388 |
+
torch.zeros(shape_qkv_a, dtype=dtype, device=device),
|
389 |
+
torch.zeros(shape_qkv_b, dtype=dtype, device=device),
|
390 |
+
]
|
391 |
+
if not has_mlp:
|
392 |
+
mlp_A, mlp_B = [
|
393 |
+
torch.zeros(shape_qkv_a, dtype=dtype, device=device),
|
394 |
+
torch.zeros(shape_qkv_b, dtype=dtype, device=device),
|
395 |
+
]
|
396 |
+
original_state_dict[f"single_blocks.{i}.linear1.lora_A.weight"] = torch.cat(
|
397 |
+
[q_A, k_A, v_A, mlp_A], dim=0
|
398 |
+
)
|
399 |
+
original_state_dict[f"single_blocks.{i}.linear1.lora_B.weight"] = torch.cat(
|
400 |
+
[q_B, k_B, v_B, mlp_B], dim=0
|
401 |
+
)
|
402 |
+
|
403 |
+
# output projections
|
404 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
405 |
+
original_state_dict,
|
406 |
+
diffusers_state_dict,
|
407 |
+
f"{prefix}{block_prefix}proj_out.weight",
|
408 |
+
f"single_blocks.{i}.linear2.weight",
|
409 |
+
)
|
410 |
+
|
411 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
412 |
+
original_state_dict,
|
413 |
+
diffusers_state_dict,
|
414 |
+
f"{prefix}proj_out.weight",
|
415 |
+
"final_layer.linear.weight",
|
416 |
+
)
|
417 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
418 |
+
original_state_dict,
|
419 |
+
diffusers_state_dict,
|
420 |
+
f"{prefix}proj_out.bias",
|
421 |
+
"final_layer.linear.bias",
|
422 |
+
)
|
423 |
+
original_state_dict, diffusers_state_dict = convert_if_lora_exists(
|
424 |
+
original_state_dict,
|
425 |
+
diffusers_state_dict,
|
426 |
+
f"{prefix}norm_out.linear.weight",
|
427 |
+
"final_layer.adaLN_modulation.1.weight",
|
428 |
+
)
|
429 |
+
if len(diffusers_state_dict) > 0:
|
430 |
+
logger.warning("Unexpected keys:", diffusers_state_dict.keys())
|
431 |
+
|
432 |
+
return original_state_dict
|
433 |
+
|
434 |
+
|
435 |
+
def convert_from_original_flux_checkpoint(original_state_dict: StateDict) -> StateDict:
|
436 |
+
"""
|
437 |
+
Convert the state dict from the original Flux checkpoint format to the new format.
|
438 |
+
|
439 |
+
Args:
|
440 |
+
original_state_dict (Dict[str, torch.Tensor]): The original Flux checkpoint state dict.
|
441 |
+
|
442 |
+
Returns:
|
443 |
+
Dict[str, torch.Tensor]: The converted state dict in the new format.
|
444 |
+
"""
|
445 |
+
sd = {
|
446 |
+
k.replace("lora_unet_", "")
|
447 |
+
.replace("double_blocks_", "double_blocks.")
|
448 |
+
.replace("single_blocks_", "single_blocks.")
|
449 |
+
.replace("_img_attn_", ".img_attn.")
|
450 |
+
.replace("_txt_attn_", ".txt_attn.")
|
451 |
+
.replace("_img_mod_", ".img_mod.")
|
452 |
+
.replace("_txt_mod_", ".txt_mod.")
|
453 |
+
.replace("_img_mlp_", ".img_mlp.")
|
454 |
+
.replace("_txt_mlp_", ".txt_mlp.")
|
455 |
+
.replace("_linear1", ".linear1")
|
456 |
+
.replace("_linear2", ".linear2")
|
457 |
+
.replace("_modulation_", ".modulation.")
|
458 |
+
.replace("lora_up", "lora_B")
|
459 |
+
.replace("lora_down", "lora_A"): v
|
460 |
+
for k, v in original_state_dict.items()
|
461 |
+
if "lora" in k
|
462 |
+
}
|
463 |
+
return sd
|
464 |
+
|
465 |
+
|
466 |
+
def get_module_for_key(
|
467 |
+
key: str, model: Flux
|
468 |
+
) -> F8Linear | torch.nn.Linear | CublasLinear:
|
469 |
+
parts = key.split(".")
|
470 |
+
module = model
|
471 |
+
for part in parts:
|
472 |
+
module = getattr(module, part)
|
473 |
+
return module
|
474 |
+
|
475 |
+
|
476 |
+
def get_lora_for_key(
|
477 |
+
key: str, lora_weights: dict
|
478 |
+
) -> Optional[Tuple[torch.Tensor, torch.Tensor, Optional[float]]]:
|
479 |
+
"""
|
480 |
+
Get LoRA weights for a specific key.
|
481 |
+
|
482 |
+
Args:
|
483 |
+
key (str): The key to look up in the LoRA weights.
|
484 |
+
lora_weights (dict): Dictionary containing LoRA weights.
|
485 |
+
|
486 |
+
Returns:
|
487 |
+
Optional[Tuple[torch.Tensor, torch.Tensor, Optional[float]]]: A tuple containing lora_A, lora_B, and alpha if found, None otherwise.
|
488 |
+
"""
|
489 |
+
prefix = key.split(".lora")[0]
|
490 |
+
lora_A = lora_weights.get(f"{prefix}.lora_A.weight")
|
491 |
+
lora_B = lora_weights.get(f"{prefix}.lora_B.weight")
|
492 |
+
alpha = lora_weights.get(f"{prefix}.alpha")
|
493 |
+
|
494 |
+
if lora_A is None or lora_B is None:
|
495 |
+
return None
|
496 |
+
return lora_A, lora_B, alpha
|
497 |
+
|
498 |
+
|
507 |
+
|
508 |
+
|
509 |
+
def calculate_lora_weight(
|
510 |
+
lora_weights: Tuple[torch.Tensor, torch.Tensor, Union[torch.Tensor, float]],
|
511 |
+
rank: Optional[int] = None,
|
512 |
+
lora_scale: float = 1.0,
|
513 |
+
device: Optional[Union[torch.device, int, str]] = None,
|
514 |
+
):
|
515 |
+
lora_A, lora_B, alpha = lora_weights
|
516 |
+
if device is None:
|
517 |
+
device = lora_A.device
|
518 |
+
|
519 |
+
uneven_rank = lora_B.shape[1] != lora_A.shape[0]
|
520 |
+
rank_diff = lora_A.shape[0] / lora_B.shape[1]
|
521 |
+
|
522 |
+
if rank is None:
|
523 |
+
rank = lora_B.shape[1]
|
524 |
+
if alpha is None:
|
525 |
+
alpha = rank
|
526 |
+
|
527 |
+
dtype = torch.float32
|
528 |
+
w_up = lora_A.to(dtype=dtype, device=device)
|
529 |
+
w_down = lora_B.to(dtype=dtype, device=device)
|
530 |
+
|
531 |
+
if alpha != rank:
|
532 |
+
w_up = w_up * alpha / rank
|
533 |
+
if uneven_rank:
|
534 |
+
# Fuse each lora instead of repeat interleave for each individual lora,
|
535 |
+
# seems to fuse more correctly.
|
536 |
+
fused_lora = torch.zeros(
|
537 |
+
(lora_B.shape[0], lora_A.shape[1]), device=device, dtype=dtype
|
538 |
+
)
|
539 |
+
w_up = w_up.chunk(int(rank_diff), dim=0)
|
540 |
+
for w_up_chunk in w_up:
|
541 |
+
fused_lora = fused_lora + (lora_scale * torch.mm(w_down, w_up_chunk))
|
542 |
+
else:
|
543 |
+
fused_lora = lora_scale * torch.mm(w_down, w_up)
|
544 |
+
return fused_lora
|
545 |
+
|
546 |
+
|
547 |
+
@torch.inference_mode()
|
548 |
+
def unfuse_lora_weight_from_module(
|
549 |
+
fused_weight: torch.Tensor,
|
550 |
+
lora_weights: dict,
|
551 |
+
rank: Optional[int] = None,
|
552 |
+
lora_scale: float = 1.0,
|
553 |
+
):
|
554 |
+
w_dtype = fused_weight.dtype
|
555 |
+
dtype = torch.float32
|
556 |
+
device = fused_weight.device
|
557 |
+
|
558 |
+
fused_weight = fused_weight.to(dtype=dtype, device=device)
|
559 |
+
fused_lora = calculate_lora_weight(lora_weights, rank, lora_scale, device=device)
|
560 |
+
module_weight = fused_weight - fused_lora
|
561 |
+
return module_weight.to(dtype=w_dtype, device=device)
|
562 |
+
|
563 |
+
|
564 |
+
@torch.inference_mode()
|
565 |
+
def apply_lora_weight_to_module(
|
566 |
+
module_weight: torch.Tensor,
|
567 |
+
lora_weights: dict,
|
568 |
+
rank: Optional[int] = None,
|
569 |
+
lora_scale: float = 1.0,
|
570 |
+
):
|
571 |
+
w_dtype = module_weight.dtype
|
572 |
+
dtype = torch.float32
|
573 |
+
device = module_weight.device
|
574 |
+
|
575 |
+
fused_lora = calculate_lora_weight(lora_weights, rank, lora_scale, device=device)
|
576 |
+
fused_weight = module_weight.to(dtype=dtype) + fused_lora
|
577 |
+
return fused_weight.to(dtype=w_dtype, device=device)
|
578 |
+
|
579 |
+
|
580 |
+
def resolve_lora_state_dict(lora_weights, has_guidance: bool = True):
|
581 |
+
check_if_starts_with_transformer = [
|
582 |
+
k for k in lora_weights.keys() if k.startswith("transformer.")
|
583 |
+
]
|
584 |
+
if len(check_if_starts_with_transformer) > 0:
|
585 |
+
lora_weights = convert_diffusers_to_flux_transformer_checkpoint(
|
586 |
+
lora_weights, 19, 38, has_guidance=has_guidance, prefix="transformer."
|
587 |
+
)
|
588 |
+
else:
|
589 |
+
lora_weights = convert_from_original_flux_checkpoint(lora_weights)
|
590 |
+
logger.info("LoRA weights loaded")
|
591 |
+
logger.debug("Extracting keys")
|
592 |
+
keys_without_ab = list(
|
593 |
+
set(
|
594 |
+
[
|
595 |
+
key.replace(".lora_A.weight", "")
|
596 |
+
.replace(".lora_B.weight", "")
|
597 |
+
.replace(".lora_A", "")
|
598 |
+
.replace(".lora_B", "")
|
599 |
+
.replace(".alpha", "")
|
600 |
+
for key in lora_weights.keys()
|
601 |
+
]
|
602 |
+
)
|
603 |
+
)
|
604 |
+
logger.debug("Keys extracted")
|
605 |
+
return keys_without_ab, lora_weights
|
606 |
+
|
607 |
+
|
608 |
+
def get_lora_weights(lora_path: str | StateDict | LoraWeights):
|
609 |
+
if isinstance(lora_path, (dict, LoraWeights)):
|
610 |
+
return lora_path, True
|
611 |
+
else:
|
612 |
+
return load_file(lora_path, "cpu"), False
|
613 |
+
|
614 |
+
|
615 |
+
def extract_weight_from_linear(linear: Union[nn.Linear, CublasLinear, F8Linear]):
|
616 |
+
dtype = linear.weight.dtype
|
617 |
+
weight_is_f8 = False
|
618 |
+
if isinstance(linear, F8Linear):
|
619 |
+
weight_is_f8 = True
|
620 |
+
weight = (
|
621 |
+
linear.float8_data.clone()
|
622 |
+
.detach()
|
623 |
+
.float()
|
624 |
+
.mul(linear.scale_reciprocal)
|
625 |
+
.to(linear.weight.device)
|
626 |
+
)
|
627 |
+
elif isinstance(linear, torch.nn.Linear):
|
628 |
+
weight = linear.weight.clone().detach().float()
|
629 |
+
else:  # CublasLinear (when available) or any other linear-like module with a .weight
|
630 |
+
weight = linear.weight.clone().detach().float()
|
631 |
+
return weight, weight_is_f8, dtype
|
632 |
+
|
633 |
+
|
634 |
+
@torch.inference_mode()
|
635 |
+
def apply_lora_to_model(
|
636 |
+
model: Flux,
|
637 |
+
lora_path: str | StateDict,
|
638 |
+
lora_scale: float = 1.0,
|
639 |
+
return_lora_resolved: bool = False,
|
640 |
+
) -> Flux:
|
641 |
+
has_guidance = model.params.guidance_embed
|
642 |
+
logger.info(f"Loading LoRA weights for {lora_path}")
|
643 |
+
lora_weights, already_loaded = get_lora_weights(lora_path)
|
644 |
+
|
645 |
+
if not already_loaded:
|
646 |
+
keys_without_ab, lora_weights = resolve_lora_state_dict(
|
647 |
+
lora_weights, has_guidance
|
648 |
+
)
|
649 |
+
elif isinstance(lora_weights, LoraWeights):
|
650 |
+
b_ = lora_weights
|
651 |
+
lora_weights = b_.weights
|
652 |
+
keys_without_ab = list(
|
653 |
+
set(
|
654 |
+
[
|
655 |
+
key.replace(".lora_A.weight", "")
|
656 |
+
.replace(".lora_B.weight", "")
|
657 |
+
.replace(".lora_A", "")
|
658 |
+
.replace(".lora_B", "")
|
659 |
+
.replace(".alpha", "")
|
660 |
+
for key in lora_weights.keys()
|
661 |
+
]
|
662 |
+
)
|
663 |
+
)
|
664 |
+
else:
|
665 |
+
lora_weights = lora_weights
|
666 |
+
keys_without_ab = list(
|
667 |
+
set(
|
668 |
+
[
|
669 |
+
key.replace(".lora_A.weight", "")
|
670 |
+
.replace(".lora_B.weight", "")
|
671 |
+
.replace(".lora_A", "")
|
672 |
+
.replace(".lora_B", "")
|
673 |
+
.replace(".alpha", "")
|
674 |
+
for key in lora_weights.keys()
|
675 |
+
]
|
676 |
+
)
|
677 |
+
)
|
678 |
+
for key in tqdm(keys_without_ab, desc="Applying LoRA", total=len(keys_without_ab)):
|
679 |
+
module = get_module_for_key(key, model)
|
680 |
+
weight, is_f8, dtype = extract_weight_from_linear(module)
|
681 |
+
lora_sd = get_lora_for_key(key, lora_weights)
|
682 |
+
if lora_sd is None:
|
683 |
+
# Skipping LoRA application for this module
|
684 |
+
continue
|
685 |
+
weight = apply_lora_weight_to_module(weight, lora_sd, lora_scale=lora_scale)
|
686 |
+
if is_f8:
|
687 |
+
module.set_weight_tensor(weight.type(dtype))
|
688 |
+
else:
|
689 |
+
module.weight.data = weight.type(dtype)
|
690 |
+
logger.success("Lora applied")
|
691 |
+
if return_lora_resolved:
|
692 |
+
return model, lora_weights
|
693 |
+
return model
|
694 |
+
|
695 |
+
|
696 |
+
def remove_lora_from_module(
|
697 |
+
model: Flux,
|
698 |
+
lora_path: str | StateDict,
|
699 |
+
lora_scale: float = 1.0,
|
700 |
+
):
|
701 |
+
has_guidance = model.params.guidance_embed
|
702 |
+
logger.info(f"Loading LoRA weights for {lora_path}")
|
703 |
+
lora_weights, already_loaded = get_lora_weights(lora_path)
|
704 |
+
|
705 |
+
if not already_loaded:
|
706 |
+
keys_without_ab, lora_weights = resolve_lora_state_dict(
|
707 |
+
lora_weights, has_guidance
|
708 |
+
)
|
709 |
+
elif isinstance(lora_weights, LoraWeights):
|
710 |
+
b_ = lora_weights
|
711 |
+
lora_weights = b_.weights
|
712 |
+
keys_without_ab = list(
|
713 |
+
set(
|
714 |
+
[
|
715 |
+
key.replace(".lora_A.weight", "")
|
716 |
+
.replace(".lora_B.weight", "")
|
717 |
+
.replace(".lora_A", "")
|
718 |
+
.replace(".lora_B", "")
|
719 |
+
.replace(".alpha", "")
|
720 |
+
for key in lora_weights.keys()
|
721 |
+
]
|
722 |
+
)
|
723 |
+
)
|
724 |
+
lora_scale = b_.scale
|
725 |
+
else:
|
726 |
+
# already a plain state dict; nothing to unwrap
|
727 |
+
keys_without_ab = list(
|
728 |
+
set(
|
729 |
+
[
|
730 |
+
key.replace(".lora_A.weight", "")
|
731 |
+
.replace(".lora_B.weight", "")
|
732 |
+
.replace(".lora_A", "")
|
733 |
+
.replace(".lora_B", "")
|
734 |
+
.replace(".alpha", "")
|
735 |
+
for key in lora_weights.keys()
|
736 |
+
]
|
737 |
+
)
|
738 |
+
)
|
739 |
+
|
740 |
+
for key in tqdm(keys_without_ab, desc="Unfusing LoRA", total=len(keys_without_ab)):
|
741 |
+
module = get_module_for_key(key, model)
|
742 |
+
weight, is_f8, dtype = extract_weight_from_linear(module)
|
743 |
+
lora_sd = get_lora_for_key(key, lora_weights)
|
744 |
+
if lora_sd is None:
|
745 |
+
# Skipping LoRA application for this module
|
746 |
+
continue
|
747 |
+
weight = unfuse_lora_weight_from_module(weight, lora_sd, lora_scale=lora_scale)
|
748 |
+
if is_f8:
|
749 |
+
module.set_weight_tensor(weight.type(dtype))
|
750 |
+
else:
|
751 |
+
module.weight.data = weight.type(dtype)
|
752 |
+
logger.success("Lora unfused")
|
753 |
+
return model
|
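
For matched ranks, the fusing above reduces to W' = W + lora_scale * (alpha / rank) * (lora_B @ lora_A). A small numeric check of calculate_lora_weight under that assumption (all shapes are made up):

import torch
from lora_loading import calculate_lora_weight

rank, d_in, d_out = 16, 64, 64
lora_A = torch.randn(rank, d_in)   # "down" projection: (rank, in_features)
lora_B = torch.randn(d_out, rank)  # "up" projection: (out_features, rank)
alpha = 8.0

delta = calculate_lora_weight((lora_A, lora_B, alpha), lora_scale=0.8)
expected = 0.8 * (alpha / rank) * (lora_B @ lora_A)
assert torch.allclose(delta, expected, atol=1e-5)
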
main.py
ADDED
@@ -0,0 +1,199 @@
1 |
+
import argparse
|
2 |
+
import uvicorn
|
3 |
+
from api import app
|
4 |
+
|
5 |
+
|
6 |
+
def parse_args():
|
7 |
+
parser = argparse.ArgumentParser(description="Launch Flux API server")
|
8 |
+
parser.add_argument(
|
9 |
+
"-c",
|
10 |
+
"--config-path",
|
11 |
+
type=str,
|
12 |
+
help="Path to the configuration file, if not provided, the model will be loaded from the command line arguments",
|
13 |
+
)
|
14 |
+
parser.add_argument(
|
15 |
+
"-p",
|
16 |
+
"--port",
|
17 |
+
type=int,
|
18 |
+
default=8088,
|
19 |
+
help="Port to run the server on",
|
20 |
+
)
|
21 |
+
parser.add_argument(
|
22 |
+
"-H",
|
23 |
+
"--host",
|
24 |
+
type=str,
|
25 |
+
default="0.0.0.0",
|
26 |
+
help="Host to run the server on",
|
27 |
+
)
|
28 |
+
parser.add_argument(
|
29 |
+
"-f", "--flow-model-path", type=str, help="Path to the flow model"
|
30 |
+
)
|
31 |
+
parser.add_argument(
|
32 |
+
"-t", "--text-enc-path", type=str, help="Path to the text encoder"
|
33 |
+
)
|
34 |
+
parser.add_argument(
|
35 |
+
"-a", "--autoencoder-path", type=str, help="Path to the autoencoder"
|
36 |
+
)
|
37 |
+
parser.add_argument(
|
38 |
+
"-m",
|
39 |
+
"--model-version",
|
40 |
+
type=str,
|
41 |
+
choices=["flux-dev", "flux-schnell"],
|
42 |
+
default="flux-dev",
|
43 |
+
help="Choose model version",
|
44 |
+
)
|
45 |
+
parser.add_argument(
|
46 |
+
"-F",
|
47 |
+
"--flux-device",
|
48 |
+
type=str,
|
49 |
+
default="cuda:0",
|
50 |
+
help="Device to run the flow model on",
|
51 |
+
)
|
52 |
+
parser.add_argument(
|
53 |
+
"-T",
|
54 |
+
"--text-enc-device",
|
55 |
+
type=str,
|
56 |
+
default="cuda:0",
|
57 |
+
help="Device to run the text encoder on",
|
58 |
+
)
|
59 |
+
parser.add_argument(
|
60 |
+
"-A",
|
61 |
+
"--autoencoder-device",
|
62 |
+
type=str,
|
63 |
+
default="cuda:0",
|
64 |
+
help="Device to run the autoencoder on",
|
65 |
+
)
|
66 |
+
parser.add_argument(
|
67 |
+
"-q",
|
68 |
+
"--num-to-quant",
|
69 |
+
type=int,
|
70 |
+
default=20,
|
71 |
+
help="Number of linear layers in flow transformer (the 'unet') to quantize",
|
72 |
+
)
|
73 |
+
parser.add_argument(
|
74 |
+
"-C",
|
75 |
+
"--compile",
|
76 |
+
action="store_true",
|
77 |
+
default=False,
|
78 |
+
help="Compile the flow model with extra optimizations",
|
79 |
+
)
|
80 |
+
parser.add_argument(
|
81 |
+
"-qT",
|
82 |
+
"--quant-text-enc",
|
83 |
+
type=str,
|
84 |
+
default="qfloat8",
|
85 |
+
choices=["qint4", "qfloat8", "qint2", "qint8", "bf16"],
|
86 |
+
help="Quantize the t5 text encoder to the given dtype, if bf16, will not quantize",
|
87 |
+
dest="quant_text_enc",
|
88 |
+
)
|
89 |
+
parser.add_argument(
|
90 |
+
"-qA",
|
91 |
+
"--quant-ae",
|
92 |
+
action="store_true",
|
93 |
+
default=False,
|
94 |
+
help="Quantize the autoencoder with float8 linear layers, otherwise will use bfloat16",
|
95 |
+
dest="quant_ae",
|
96 |
+
)
|
97 |
+
parser.add_argument(
|
98 |
+
"-OF",
|
99 |
+
"--offload-flow",
|
100 |
+
action="store_true",
|
101 |
+
default=False,
|
102 |
+
dest="offload_flow",
|
103 |
+
help="Offload the flow model to the CPU when not being used to save memory",
|
104 |
+
)
|
105 |
+
parser.add_argument(
|
106 |
+
"-OA",
|
107 |
+
"--no-offload-ae",
|
108 |
+
action="store_false",
|
109 |
+
default=True,
|
110 |
+
dest="offload_ae",
|
111 |
+
help="Disable offloading the autoencoder to the CPU when not being used to increase e2e inference speed",
|
112 |
+
)
|
113 |
+
parser.add_argument(
|
114 |
+
"-OT",
|
115 |
+
"--no-offload-text-enc",
|
116 |
+
action="store_false",
|
117 |
+
default=True,
|
118 |
+
dest="offload_text_enc",
|
119 |
+
help="Disable offloading the text encoder to the CPU when not being used to increase e2e inference speed",
|
120 |
+
)
|
121 |
+
parser.add_argument(
|
122 |
+
"-PF",
|
123 |
+
"--prequantized-flow",
|
124 |
+
action="store_true",
|
125 |
+
default=False,
|
126 |
+
dest="prequantized_flow",
|
127 |
+
help="Load the flow model from a prequantized checkpoint "
|
128 |
+
+ "(requires loading the flow model, running a minimum of 24 steps, "
|
129 |
+
+ "and then saving the state_dict as a safetensors file), "
|
130 |
+
+ "which reduces the size of the checkpoint by about 50% & reduces startup time",
|
131 |
+
)
|
132 |
+
parser.add_argument(
|
133 |
+
"-nqfm",
|
134 |
+
"--no-quantize-flow-modulation",
|
135 |
+
action="store_false",
|
136 |
+
default=True,
|
137 |
+
dest="quantize_modulation",
|
138 |
+
help="Disable quantization of the modulation layers in the flow model, adds ~2GB vram usage for moderate precision improvements",
|
139 |
+
)
|
140 |
+
parser.add_argument(
|
141 |
+
"-qfl",
|
142 |
+
"--quantize-flow-embedder-layers",
|
143 |
+
action="store_true",
|
144 |
+
default=False,
|
145 |
+
dest="quantize_flow_embedder_layers",
|
146 |
+
help="Quantize the flow embedder layers in the flow model, saves ~512MB vram usage, but precision loss is very noticeable",
|
147 |
+
)
|
148 |
+
return parser.parse_args()
|
149 |
+
|
150 |
+
|
151 |
+
def main():
|
152 |
+
args = parse_args()
|
153 |
+
|
154 |
+
# lazy imports so the CLI responds quickly instead of waiting for torch to load
|
155 |
+
from flux_pipeline import FluxPipeline
|
156 |
+
from util import load_config, ModelVersion
|
157 |
+
|
158 |
+
if args.config_path:
|
159 |
+
app.state.model = FluxPipeline.load_pipeline_from_config_path(
|
160 |
+
args.config_path, flow_model_path=args.flow_model_path
|
161 |
+
)
|
162 |
+
else:
|
163 |
+
model_version = (
|
164 |
+
ModelVersion.flux_dev
|
165 |
+
if args.model_version == "flux-dev"
|
166 |
+
else ModelVersion.flux_schnell
|
167 |
+
)
|
168 |
+
config = load_config(
|
169 |
+
model_version,
|
170 |
+
flux_path=args.flow_model_path,
|
171 |
+
flux_device=args.flux_device,
|
172 |
+
ae_path=args.autoencoder_path,
|
173 |
+
ae_device=args.autoencoder_device,
|
174 |
+
text_enc_path=args.text_enc_path,
|
175 |
+
text_enc_device=args.text_enc_device,
|
176 |
+
flow_dtype="float16",
|
177 |
+
text_enc_dtype="bfloat16",
|
178 |
+
ae_dtype="bfloat16",
|
179 |
+
num_to_quant=args.num_to_quant,
|
180 |
+
compile_extras=args.compile,
|
181 |
+
compile_blocks=args.compile,
|
182 |
+
quant_text_enc=(
|
183 |
+
None if args.quant_text_enc == "bf16" else args.quant_text_enc
|
184 |
+
),
|
185 |
+
quant_ae=args.quant_ae,
|
186 |
+
offload_flow=args.offload_flow,
|
187 |
+
offload_ae=args.offload_ae,
|
188 |
+
offload_text_enc=args.offload_text_enc,
|
189 |
+
prequantized_flow=args.prequantized_flow,
|
190 |
+
quantize_modulation=args.quantize_modulation,
|
191 |
+
quantize_flow_embedder_layers=args.quantize_flow_embedder_layers,
|
192 |
+
)
|
193 |
+
app.state.model = FluxPipeline.load_pipeline_from_config(config)
|
194 |
+
|
195 |
+
uvicorn.run(app, host=args.host, port=args.port)
|
196 |
+
|
197 |
+
|
198 |
+
if __name__ == "__main__":
|
199 |
+
main()
|
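
Illustrative invocations of this launcher (model paths are placeholders):

python main.py --config-path configs/config-dev.json --port 8088

# or configure everything from flags instead of a config file:
python main.py -m flux-dev -f /path/to/flux1-dev.safetensors \
    -a /path/to/ae.safetensors -t /path/to/text-encoder -OF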
modules/autoencoder.py
ADDED
@@ -0,0 +1,336 @@
1 |
+
import torch
|
2 |
+
from einops import rearrange
|
3 |
+
from torch import Tensor, nn
|
4 |
+
from pydantic import BaseModel
|
5 |
+
|
6 |
+
|
7 |
+
class AutoEncoderParams(BaseModel):
|
8 |
+
resolution: int
|
9 |
+
in_channels: int
|
10 |
+
ch: int
|
11 |
+
out_ch: int
|
12 |
+
ch_mult: list[int]
|
13 |
+
num_res_blocks: int
|
14 |
+
z_channels: int
|
15 |
+
scale_factor: float
|
16 |
+
shift_factor: float
|
17 |
+
|
18 |
+
|
19 |
+
def swish(x: Tensor) -> Tensor:
|
20 |
+
return x * torch.sigmoid(x)
|
21 |
+
|
22 |
+
|
23 |
+
class AttnBlock(nn.Module):
|
24 |
+
def __init__(self, in_channels: int):
|
25 |
+
super().__init__()
|
26 |
+
self.in_channels = in_channels
|
27 |
+
|
28 |
+
self.norm = nn.GroupNorm(
|
29 |
+
num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
|
30 |
+
)
|
31 |
+
|
32 |
+
self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
|
33 |
+
self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
|
34 |
+
self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
|
35 |
+
self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
|
36 |
+
|
37 |
+
def attention(self, h_: Tensor) -> Tensor:
|
38 |
+
h_ = self.norm(h_)
|
39 |
+
q = self.q(h_)
|
40 |
+
k = self.k(h_)
|
41 |
+
v = self.v(h_)
|
42 |
+
|
43 |
+
b, c, h, w = q.shape
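        # flatten the spatial grid into a length-(h*w) token sequence and run
        # single-head scaled-dot-product attention over it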
|
44 |
+
q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
|
45 |
+
k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
|
46 |
+
v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
|
47 |
+
h_ = nn.functional.scaled_dot_product_attention(q, k, v)
|
48 |
+
|
49 |
+
return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
|
50 |
+
|
51 |
+
def forward(self, x: Tensor) -> Tensor:
|
52 |
+
return x + self.proj_out(self.attention(x))
|
53 |
+
|
54 |
+
|
55 |
+
class ResnetBlock(nn.Module):
|
56 |
+
def __init__(self, in_channels: int, out_channels: int | None = None):
|
57 |
+
super().__init__()
|
58 |
+
self.in_channels = in_channels
|
59 |
+
out_channels = in_channels if out_channels is None else out_channels
|
60 |
+
self.out_channels = out_channels
|
61 |
+
|
62 |
+
self.norm1 = nn.GroupNorm(
|
63 |
+
num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
|
64 |
+
)
|
65 |
+
self.conv1 = nn.Conv2d(
|
66 |
+
in_channels, out_channels, kernel_size=3, stride=1, padding=1
|
67 |
+
)
|
68 |
+
self.norm2 = nn.GroupNorm(
|
69 |
+
num_groups=32, num_channels=out_channels, eps=1e-6, affine=True
|
70 |
+
)
|
71 |
+
self.conv2 = nn.Conv2d(
|
72 |
+
out_channels, out_channels, kernel_size=3, stride=1, padding=1
|
73 |
+
)
|
74 |
+
if self.in_channels != self.out_channels:
|
75 |
+
self.nin_shortcut = nn.Conv2d(
|
76 |
+
in_channels, out_channels, kernel_size=1, stride=1, padding=0
|
77 |
+
)
|
78 |
+
|
79 |
+
def forward(self, x):
|
80 |
+
h = x
|
81 |
+
h = self.norm1(h)
|
82 |
+
        h = swish(h)
        h = self.conv1(h)

        h = self.norm2(h)
        h = swish(h)
        h = self.conv2(h)

        if self.in_channels != self.out_channels:
            x = self.nin_shortcut(x)

        return x + h


class Downsample(nn.Module):
    def __init__(self, in_channels: int):
        super().__init__()
        # no asymmetric padding in torch conv, must do it ourselves
        self.conv = nn.Conv2d(
            in_channels, in_channels, kernel_size=3, stride=2, padding=0
        )

    def forward(self, x: Tensor):
        pad = (0, 1, 0, 1)
        x = nn.functional.pad(x, pad, mode="constant", value=0)
        x = self.conv(x)
        return x


class Upsample(nn.Module):
    def __init__(self, in_channels: int):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels, in_channels, kernel_size=3, stride=1, padding=1
        )

    def forward(self, x: Tensor):
        x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
        x = self.conv(x)
        return x


class Encoder(nn.Module):
    def __init__(
        self,
        resolution: int,
        in_channels: int,
        ch: int,
        ch_mult: list[int],
        num_res_blocks: int,
        z_channels: int,
    ):
        super().__init__()
        self.ch = ch
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        # downsampling
        self.conv_in = nn.Conv2d(
            in_channels, self.ch, kernel_size=3, stride=1, padding=1
        )

        curr_res = resolution
        in_ch_mult = (1,) + tuple(ch_mult)
        self.in_ch_mult = in_ch_mult
        self.down = nn.ModuleList()
        block_in = self.ch
        for i_level in range(self.num_resolutions):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_in = ch * in_ch_mult[i_level]
            block_out = ch * ch_mult[i_level]
            for _ in range(self.num_res_blocks):
                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
                block_in = block_out
            down = nn.Module()
            down.block = block
            down.attn = attn
            if i_level != self.num_resolutions - 1:
                down.downsample = Downsample(block_in)
                curr_res = curr_res // 2
            self.down.append(down)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
        self.mid.attn_1 = AttnBlock(block_in)
        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)

        # end
        self.norm_out = nn.GroupNorm(
            num_groups=32, num_channels=block_in, eps=1e-6, affine=True
        )
        self.conv_out = nn.Conv2d(
            block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1
        )

    def forward(self, x: Tensor) -> Tensor:
        # downsampling
        hs = [self.conv_in(x)]
        for i_level in range(self.num_resolutions):
            for i_block in range(self.num_res_blocks):
                h = self.down[i_level].block[i_block](hs[-1])
                if len(self.down[i_level].attn) > 0:
                    h = self.down[i_level].attn[i_block](h)
                hs.append(h)
            if i_level != self.num_resolutions - 1:
                hs.append(self.down[i_level].downsample(hs[-1]))

        # middle
        h = hs[-1]
        h = self.mid.block_1(h)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h)
        # end
        h = self.norm_out(h)
        h = swish(h)
        h = self.conv_out(h)
        return h


class Decoder(nn.Module):
    def __init__(
        self,
        ch: int,
        out_ch: int,
        ch_mult: list[int],
        num_res_blocks: int,
        in_channels: int,
        resolution: int,
        z_channels: int,
    ):
        super().__init__()
        self.ch = ch
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        self.ffactor = 2 ** (self.num_resolutions - 1)

        # compute in_ch_mult, block_in and curr_res at lowest res
        block_in = ch * ch_mult[self.num_resolutions - 1]
        curr_res = resolution // 2 ** (self.num_resolutions - 1)
        self.z_shape = (1, z_channels, curr_res, curr_res)

        # z to block_in
        self.conv_in = nn.Conv2d(
            z_channels, block_in, kernel_size=3, stride=1, padding=1
        )

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
        self.mid.attn_1 = AttnBlock(block_in)
        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)

        # upsampling
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_out = ch * ch_mult[i_level]
            for _ in range(self.num_res_blocks + 1):
                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
                block_in = block_out
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level != 0:
                up.upsample = Upsample(block_in)
                curr_res = curr_res * 2
            self.up.insert(0, up)  # prepend to get consistent order

        # end
        self.norm_out = nn.GroupNorm(
            num_groups=32, num_channels=block_in, eps=1e-6, affine=True
        )
        self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)

    def forward(self, z: Tensor) -> Tensor:
        # z to block_in
        h = self.conv_in(z)

        # middle
        h = self.mid.block_1(h)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h)

        # upsampling
        for i_level in reversed(range(self.num_resolutions)):
            for i_block in range(self.num_res_blocks + 1):
                h = self.up[i_level].block[i_block](h)
                if len(self.up[i_level].attn) > 0:
                    h = self.up[i_level].attn[i_block](h)
            if i_level != 0:
                h = self.up[i_level].upsample(h)

        # end
        h = self.norm_out(h)
        h = swish(h)
        h = self.conv_out(h)
        return h


class DiagonalGaussian(nn.Module):
    def __init__(self, sample: bool = True, chunk_dim: int = 1):
        super().__init__()
        self.sample = sample
        self.chunk_dim = chunk_dim

    def forward(self, z: Tensor) -> Tensor:
        mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
        if self.sample:
            std = torch.exp(0.5 * logvar)
            return mean + std * torch.randn_like(mean)
        else:
            return mean


class AutoEncoder(nn.Module):
    def __init__(self, params: AutoEncoderParams):
        super().__init__()
        self.encoder = Encoder(
            resolution=params.resolution,
            in_channels=params.in_channels,
            ch=params.ch,
            ch_mult=params.ch_mult,
            num_res_blocks=params.num_res_blocks,
            z_channels=params.z_channels,
        )
        self.decoder = Decoder(
            resolution=params.resolution,
            in_channels=params.in_channels,
            ch=params.ch,
            out_ch=params.out_ch,
            ch_mult=params.ch_mult,
            num_res_blocks=params.num_res_blocks,
            z_channels=params.z_channels,
        )
        self.reg = DiagonalGaussian()

        self.scale_factor = params.scale_factor
        self.shift_factor = params.shift_factor

    def encode(self, x: Tensor) -> Tensor:
        z = self.reg(self.encoder(x))
        z = self.scale_factor * (z - self.shift_factor)
        return z

    def decode(self, z: Tensor) -> Tensor:
        z = z / self.scale_factor + self.shift_factor
        return self.decoder(z)

    def forward(self, x: Tensor) -> Tensor:
        return self.decode(self.encode(x))
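For reference, a minimal round-trip sketch for the autoencoder above (not part of this commit; the AutoEncoderParams values are the ones hard-coded in util.load_config further down, so the four-level ch_mult gives an 8x spatial downsample and a 16-channel latent):

import torch
from modules.autoencoder import AutoEncoder, AutoEncoderParams

params = AutoEncoderParams(
    resolution=256,
    in_channels=3,
    ch=128,
    out_ch=3,
    ch_mult=[1, 2, 4, 4],
    num_res_blocks=2,
    z_channels=16,
    scale_factor=0.3611,
    shift_factor=0.1159,
)
ae = AutoEncoder(params).eval()

x = torch.randn(1, 3, 256, 256)   # dummy image batch
z = ae.encode(x)                  # (1, 16, 32, 32): sampled, scaled/shifted latent
x_hat = ae.decode(z)              # (1, 3, 256, 256) reconstruction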
modules/conditioner.py
ADDED
@@ -0,0 +1,128 @@
import os

import torch
from torch import Tensor, nn
from transformers import (
    CLIPTextModel,
    CLIPTokenizer,
    T5EncoderModel,
    T5Tokenizer,
    __version__,
)
from transformers.utils.quantization_config import QuantoConfig, BitsAndBytesConfig

CACHE_DIR = os.environ.get("HF_HOME", "~/.cache/huggingface")


def auto_quantization_config(
    quantization_dtype: str,
) -> QuantoConfig | BitsAndBytesConfig | None:
    if quantization_dtype == "qfloat8":
        return QuantoConfig(weights="float8")
    elif quantization_dtype == "qint4":
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4",
        )
    elif quantization_dtype == "qint8":
        return BitsAndBytesConfig(load_in_8bit=True, llm_int8_has_fp16_weight=False)
    elif quantization_dtype == "qint2":
        return QuantoConfig(weights="int2")
    elif quantization_dtype is None or quantization_dtype == "bfloat16":
        return None
    else:
        raise ValueError(f"Unsupported quantization dtype: {quantization_dtype}")


class HFEmbedder(nn.Module):
    def __init__(
        self,
        version: str,
        max_length: int,
        device: torch.device | int,
        quantization_dtype: str | None = None,
        offloading_device: torch.device | int | None = torch.device("cpu"),
        is_clip: bool = False,
        **hf_kwargs,
    ):
        super().__init__()
        self.offloading_device = (
            offloading_device
            if isinstance(offloading_device, torch.device)
            else torch.device(offloading_device)
        )
        self.device = (
            device if isinstance(device, torch.device) else torch.device(device)
        )
        self.is_clip = version.startswith("openai") or is_clip
        self.max_length = max_length
        self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"

        auto_quant_config = (
            auto_quantization_config(quantization_dtype)
            if quantization_dtype is not None
            and quantization_dtype != "bfloat16"
            and quantization_dtype != "float16"
            else None
        )

        # BNB will move to cuda:0 by default if not specified
        if isinstance(auto_quant_config, BitsAndBytesConfig):
            hf_kwargs["device_map"] = {"": self.device.index}
        if auto_quant_config is not None:
            hf_kwargs["quantization_config"] = auto_quant_config

        if self.is_clip:
            self.tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(
                version, max_length=max_length
            )

            self.hf_module: CLIPTextModel = CLIPTextModel.from_pretrained(
                version,
                **hf_kwargs,
            )

        else:
            self.tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(
                version, max_length=max_length
            )
            self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(
                version,
                **hf_kwargs,
            )

    def offload(self):
        self.hf_module.to(device=self.offloading_device)
        torch.cuda.empty_cache()

    def cuda(self):
        self.hf_module.to(device=self.device)

    def forward(self, text: list[str]) -> Tensor:
        batch_encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_length=False,
            return_overflowing_tokens=False,
            padding="max_length",
            return_tensors="pt",
        )
        outputs = self.hf_module(
            input_ids=batch_encoding["input_ids"].to(self.hf_module.device),
            attention_mask=None,
            output_hidden_states=False,
        )
        return outputs[self.output_key]


if __name__ == "__main__":
    model = HFEmbedder(
        "city96/t5-v1_1-xxl-encoder-bf16",
        max_length=512,
        device=0,
        quantization_dtype="qfloat8",
    )
    o = model(["hello"])
    print(o)
modules/flux_model.py
ADDED
@@ -0,0 +1,734 @@
import os
from collections import namedtuple
from typing import TYPE_CHECKING, List

import torch
from loguru import logger

if TYPE_CHECKING:
    from lora_loading import LoraWeights
    from util import ModelSpec
DISABLE_COMPILE = os.getenv("DISABLE_COMPILE", "0") == "1"
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.benchmark_limit = 20
torch.set_float32_matmul_precision("high")
import math

from pydantic import BaseModel
from torch import Tensor, nn
from torch.nn import functional as F


class FluxParams(BaseModel):
    in_channels: int
    vec_in_dim: int
    context_in_dim: int
    hidden_size: int
    mlp_ratio: float
    num_heads: int
    depth: int
    depth_single_blocks: int
    axes_dim: list[int]
    theta: int
    qkv_bias: bool
    guidance_embed: bool


# attention is always same shape each time it's called per H*W, so compile with fullgraph
# @torch.compile(mode="reduce-overhead", fullgraph=True, disable=DISABLE_COMPILE)
def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
    q, k = apply_rope(q, k, pe)
    x = F.scaled_dot_product_attention(q, k, v).transpose(1, 2)
    x = x.reshape(*x.shape[:-2], -1)
    return x


# @torch.compile(mode="reduce-overhead", disable=DISABLE_COMPILE)
def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
    scale = torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim
    omega = 1.0 / (theta**scale)
    out = torch.einsum("...n,d->...nd", pos, omega)
    out = torch.stack(
        [torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1
    )
    out = out.reshape(*out.shape[:-1], 2, 2)
    return out


def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
    xq_ = xq.reshape(*xq.shape[:-1], -1, 1, 2)
    xk_ = xk.reshape(*xk.shape[:-1], -1, 1, 2)
    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
    return xq_out.reshape(*xq.shape), xk_out.reshape(*xk.shape)


class EmbedND(nn.Module):
    def __init__(
        self,
        dim: int,
        theta: int,
        axes_dim: list[int],
        dtype: torch.dtype = torch.bfloat16,
    ):
        super().__init__()
        self.dim = dim
        self.theta = theta
        self.axes_dim = axes_dim
        self.dtype = dtype

    def forward(self, ids: Tensor) -> Tensor:
        n_axes = ids.shape[-1]
        emb = torch.cat(
            [
                rope(ids[..., i], self.axes_dim[i], self.theta).type(self.dtype)
                for i in range(n_axes)
            ],
            dim=-3,
        )

        return emb.unsqueeze(1)


def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
    """
    Create sinusoidal timestep embeddings.
    :param t: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an (N, D) Tensor of positional embeddings.
    """
    t = time_factor * t
    half = dim // 2
    freqs = torch.exp(
        -math.log(max_period)
        * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device)
        / half
    )

    args = t[:, None].float() * freqs[None]
    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
    if dim % 2:
        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
    return embedding


class MLPEmbedder(nn.Module):
    def __init__(
        self, in_dim: int, hidden_dim: int, prequantized: bool = False, quantized=False
    ):
        from float8_quantize import F8Linear

        super().__init__()
        self.in_layer = (
            nn.Linear(in_dim, hidden_dim, bias=True)
            if not prequantized
            else (
                F8Linear(
                    in_features=in_dim,
                    out_features=hidden_dim,
                    bias=True,
                )
                if quantized
                else nn.Linear(in_dim, hidden_dim, bias=True)
            )
        )
        self.silu = nn.SiLU()
        self.out_layer = (
            nn.Linear(hidden_dim, hidden_dim, bias=True)
            if not prequantized
            else (
                F8Linear(
                    in_features=hidden_dim,
                    out_features=hidden_dim,
                    bias=True,
                )
                if quantized
                else nn.Linear(hidden_dim, hidden_dim, bias=True)
            )
        )

    def forward(self, x: Tensor) -> Tensor:
        return self.out_layer(self.silu(self.in_layer(x)))


class RMSNorm(torch.nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(dim))

    def forward(self, x: Tensor):
        return F.rms_norm(x.float(), self.scale.shape, self.scale, eps=1e-6).to(x)


class QKNorm(torch.nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.query_norm = RMSNorm(dim)
        self.key_norm = RMSNorm(dim)

    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
        q = self.query_norm(q)
        k = self.key_norm(k)
        return q, k


class SelfAttention(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        prequantized: bool = False,
    ):
        super().__init__()
        from float8_quantize import F8Linear

        self.num_heads = num_heads
        head_dim = dim // num_heads

        self.qkv = (
            nn.Linear(dim, dim * 3, bias=qkv_bias)
            if not prequantized
            else F8Linear(
                in_features=dim,
                out_features=dim * 3,
                bias=qkv_bias,
            )
        )
        self.norm = QKNorm(head_dim)
        self.proj = (
            nn.Linear(dim, dim)
            if not prequantized
            else F8Linear(
                in_features=dim,
                out_features=dim,
                bias=True,
            )
        )
        self.K = 3
        self.H = self.num_heads
        self.KH = self.K * self.H

    def rearrange_for_norm(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]:
        B, L, D = x.shape
        q, k, v = x.reshape(B, L, self.K, self.H, D // self.KH).permute(2, 0, 3, 1, 4)
        return q, k, v

    def forward(self, x: Tensor, pe: Tensor) -> Tensor:
        qkv = self.qkv(x)
        q, k, v = self.rearrange_for_norm(qkv)
        q, k = self.norm(q, k, v)
        x = attention(q, k, v, pe=pe)
        x = self.proj(x)
        return x


ModulationOut = namedtuple("ModulationOut", ["shift", "scale", "gate"])


class Modulation(nn.Module):
    def __init__(self, dim: int, double: bool, quantized_modulation: bool = False):
        super().__init__()
        from float8_quantize import F8Linear

        self.is_double = double
        self.multiplier = 6 if double else 3
        self.lin = (
            nn.Linear(dim, self.multiplier * dim, bias=True)
            if not quantized_modulation
            else F8Linear(
                in_features=dim,
                out_features=self.multiplier * dim,
                bias=True,
            )
        )
        self.act = nn.SiLU()

    def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
        out = self.lin(self.act(vec))[:, None, :].chunk(self.multiplier, dim=-1)

        return (
            ModulationOut(*out[:3]),
            ModulationOut(*out[3:]) if self.is_double else None,
        )


class DoubleStreamBlock(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        mlp_ratio: float,
        qkv_bias: bool = False,
        dtype: torch.dtype = torch.float16,
        quantized_modulation: bool = False,
        prequantized: bool = False,
    ):
        super().__init__()
        from float8_quantize import F8Linear

        self.dtype = dtype

        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.img_mod = Modulation(
            hidden_size, double=True, quantized_modulation=quantized_modulation
        )
        self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.img_attn = SelfAttention(
            dim=hidden_size,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            prequantized=prequantized,
        )

        self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.img_mlp = nn.Sequential(
            (
                nn.Linear(hidden_size, mlp_hidden_dim, bias=True)
                if not prequantized
                else F8Linear(
                    in_features=hidden_size,
                    out_features=mlp_hidden_dim,
                    bias=True,
                )
            ),
            nn.GELU(approximate="tanh"),
            (
                nn.Linear(mlp_hidden_dim, hidden_size, bias=True)
                if not prequantized
                else F8Linear(
                    in_features=mlp_hidden_dim,
                    out_features=hidden_size,
                    bias=True,
                )
            ),
        )

        self.txt_mod = Modulation(
            hidden_size, double=True, quantized_modulation=quantized_modulation
        )
        self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.txt_attn = SelfAttention(
            dim=hidden_size,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            prequantized=prequantized,
        )

        self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.txt_mlp = nn.Sequential(
            (
                nn.Linear(hidden_size, mlp_hidden_dim, bias=True)
                if not prequantized
                else F8Linear(
                    in_features=hidden_size,
                    out_features=mlp_hidden_dim,
                    bias=True,
                )
            ),
            nn.GELU(approximate="tanh"),
            (
                nn.Linear(mlp_hidden_dim, hidden_size, bias=True)
                if not prequantized
                else F8Linear(
                    in_features=mlp_hidden_dim,
                    out_features=hidden_size,
                    bias=True,
                )
            ),
        )
        self.K = 3
        self.H = self.num_heads
        self.KH = self.K * self.H
        self.do_clamp = dtype == torch.float16

    def rearrange_for_norm(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]:
        B, L, D = x.shape
        q, k, v = x.reshape(B, L, self.K, self.H, D // self.KH).permute(2, 0, 3, 1, 4)
        return q, k, v

    def forward(
        self,
        img: Tensor,
        txt: Tensor,
        vec: Tensor,
        pe: Tensor,
    ) -> tuple[Tensor, Tensor]:
        img_mod1, img_mod2 = self.img_mod(vec)
        txt_mod1, txt_mod2 = self.txt_mod(vec)

        # prepare image for attention
        img_modulated = self.img_norm1(img)
        img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
        img_qkv = self.img_attn.qkv(img_modulated)
        img_q, img_k, img_v = self.rearrange_for_norm(img_qkv)
        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)

        # prepare txt for attention
        txt_modulated = self.txt_norm1(txt)
        txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
        txt_qkv = self.txt_attn.qkv(txt_modulated)
        txt_q, txt_k, txt_v = self.rearrange_for_norm(txt_qkv)
        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)

        q = torch.cat((txt_q, img_q), dim=2)
        k = torch.cat((txt_k, img_k), dim=2)
        v = torch.cat((txt_v, img_v), dim=2)

        attn = attention(q, k, v, pe=pe)
        txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
        # calculate the img blocks
        img = img + img_mod1.gate * self.img_attn.proj(img_attn)
        img = img + img_mod2.gate * self.img_mlp(
            (1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift
        )

        # calculate the txt blocks
        txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
        txt = txt + txt_mod2.gate * self.txt_mlp(
            (1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift
        )
        if self.do_clamp:
            img = img.clamp(min=-32000, max=32000)
            txt = txt.clamp(min=-32000, max=32000)
        return img, txt


class SingleStreamBlock(nn.Module):
    """
    A DiT block with parallel linear layers as described in
    https://arxiv.org/abs/2302.05442 and adapted modulation interface.
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qk_scale: float | None = None,
        dtype: torch.dtype = torch.float16,
        quantized_modulation: bool = False,
        prequantized: bool = False,
    ):
        super().__init__()
        from float8_quantize import F8Linear

        self.dtype = dtype
        self.hidden_dim = hidden_size
        self.num_heads = num_heads
        head_dim = hidden_size // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
        # qkv and mlp_in
        self.linear1 = (
            nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
            if not prequantized
            else F8Linear(
                in_features=hidden_size,
                out_features=hidden_size * 3 + self.mlp_hidden_dim,
                bias=True,
            )
        )
        # proj and mlp_out
        self.linear2 = (
            nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
            if not prequantized
            else F8Linear(
                in_features=hidden_size + self.mlp_hidden_dim,
                out_features=hidden_size,
                bias=True,
            )
        )

        self.norm = QKNorm(head_dim)

        self.hidden_size = hidden_size
        self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)

        self.mlp_act = nn.GELU(approximate="tanh")
        self.modulation = Modulation(
            hidden_size,
            double=False,
            quantized_modulation=quantized_modulation and prequantized,
        )

        self.K = 3
        self.H = self.num_heads
        self.KH = self.K * self.H
        self.do_clamp = dtype == torch.float16

    def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
        mod = self.modulation(vec)[0]
        pre_norm = self.pre_norm(x)
        x_mod = (1 + mod.scale) * pre_norm + mod.shift
        qkv, mlp = torch.split(
            self.linear1(x_mod),
            [3 * self.hidden_size, self.mlp_hidden_dim],
            dim=-1,
        )
        B, L, D = qkv.shape
        q, k, v = qkv.reshape(B, L, self.K, self.H, D // self.KH).permute(2, 0, 3, 1, 4)
        q, k = self.norm(q, k, v)
        attn = attention(q, k, v, pe=pe)
        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
        if self.do_clamp:
            out = (x + mod.gate * output).clamp(min=-32000, max=32000)
        else:
            out = x + mod.gate * output
        return out


class LastLayer(nn.Module):
    def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(
            hidden_size, patch_size * patch_size * out_channels, bias=True
        )
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        )

    def forward(self, x: Tensor, vec: Tensor) -> Tensor:
        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
        x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
        x = self.linear(x)
        return x


class Flux(nn.Module):
    """
    Transformer model for flow matching on sequences.
    """

    def __init__(self, config: "ModelSpec", dtype: torch.dtype = torch.float16):
        super().__init__()

        self.dtype = dtype
        self.params = config.params
        self.in_channels = config.params.in_channels
        self.out_channels = self.in_channels
        self.loras: List[LoraWeights] = []
        prequantized_flow = config.prequantized_flow
        quantized_embedders = config.quantize_flow_embedder_layers and prequantized_flow
        quantized_modulation = config.quantize_modulation and prequantized_flow
        from float8_quantize import F8Linear

        if config.params.hidden_size % config.params.num_heads != 0:
            raise ValueError(
                f"Hidden size {config.params.hidden_size} must be divisible by num_heads {config.params.num_heads}"
            )
        pe_dim = config.params.hidden_size // config.params.num_heads
        if sum(config.params.axes_dim) != pe_dim:
            raise ValueError(
                f"Got {config.params.axes_dim} but expected positional dim {pe_dim}"
            )
        self.hidden_size = config.params.hidden_size
        self.num_heads = config.params.num_heads
        self.pe_embedder = EmbedND(
            dim=pe_dim,
            theta=config.params.theta,
            axes_dim=config.params.axes_dim,
            dtype=self.dtype,
        )
        self.img_in = (
            nn.Linear(self.in_channels, self.hidden_size, bias=True)
            if not prequantized_flow
            else (
                F8Linear(
                    in_features=self.in_channels,
                    out_features=self.hidden_size,
                    bias=True,
                )
                if quantized_embedders
                else nn.Linear(self.in_channels, self.hidden_size, bias=True)
            )
        )
        self.time_in = MLPEmbedder(
            in_dim=256,
            hidden_dim=self.hidden_size,
            prequantized=prequantized_flow,
            quantized=quantized_embedders,
        )
        self.vector_in = MLPEmbedder(
            config.params.vec_in_dim,
            self.hidden_size,
            prequantized=prequantized_flow,
            quantized=quantized_embedders,
        )
        self.guidance_in = (
            MLPEmbedder(
                in_dim=256,
                hidden_dim=self.hidden_size,
                prequantized=prequantized_flow,
                quantized=quantized_embedders,
            )
            if config.params.guidance_embed
            else nn.Identity()
        )
        self.txt_in = (
            nn.Linear(config.params.context_in_dim, self.hidden_size)
            if not quantized_embedders
            else (
                F8Linear(
                    in_features=config.params.context_in_dim,
                    out_features=self.hidden_size,
                    bias=True,
                )
                if quantized_embedders
                else nn.Linear(config.params.context_in_dim, self.hidden_size)
            )
        )

        self.double_blocks = nn.ModuleList(
            [
                DoubleStreamBlock(
                    self.hidden_size,
                    self.num_heads,
                    mlp_ratio=config.params.mlp_ratio,
                    qkv_bias=config.params.qkv_bias,
                    dtype=self.dtype,
                    quantized_modulation=quantized_modulation,
                    prequantized=prequantized_flow,
                )
                for _ in range(config.params.depth)
            ]
        )

        self.single_blocks = nn.ModuleList(
            [
                SingleStreamBlock(
                    self.hidden_size,
                    self.num_heads,
                    mlp_ratio=config.params.mlp_ratio,
                    dtype=self.dtype,
                    quantized_modulation=quantized_modulation,
                    prequantized=prequantized_flow,
                )
                for _ in range(config.params.depth_single_blocks)
            ]
        )

        self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)

    def get_lora(self, identifier: str):
        for lora in self.loras:
            if lora.path == identifier or lora.name == identifier:
                return lora

    def has_lora(self, identifier: str):
        for lora in self.loras:
            if lora.path == identifier or lora.name == identifier:
                return True

    def load_lora(self, path: str, scale: float, name: str = None):
        from lora_loading import (
            LoraWeights,
            apply_lora_to_model,
            remove_lora_from_module,
        )

        if self.has_lora(path):
            lora = self.get_lora(path)
            if lora.scale == scale:
                logger.warning(
                    f"Lora {lora.name} already loaded with same scale - ignoring!"
                )
            else:
                remove_lora_from_module(self, lora, lora.scale)
                apply_lora_to_model(self, lora, scale)
                for idx, lora_ in enumerate(self.loras):
                    if lora_.path == lora.path:
                        self.loras[idx].scale = scale
                        break
        else:
            _, lora = apply_lora_to_model(self, path, scale, return_lora_resolved=True)
            self.loras.append(LoraWeights(lora, path, name, scale))

    def unload_lora(self, path_or_identifier: str):
        from lora_loading import remove_lora_from_module

        removed = False
        for idx, lora_ in enumerate(list(self.loras)):
            if lora_.path == path_or_identifier or lora_.name == path_or_identifier:
                remove_lora_from_module(self, lora_.weights, lora_.scale)
                self.loras.pop(idx)
                removed = True
                break
        if not removed:
            logger.warning(
                f"Couldn't remove lora {path_or_identifier} as it wasn't found fused to the model!"
            )
        else:
            logger.info("Successfully removed lora from module.")

    def forward(
        self,
        img: Tensor,
        img_ids: Tensor,
        txt: Tensor,
        txt_ids: Tensor,
        timesteps: Tensor,
        y: Tensor,
        guidance: Tensor | None = None,
    ) -> Tensor:
        if img.ndim != 3 or txt.ndim != 3:
            raise ValueError("Input img and txt tensors must have 3 dimensions.")

        # running on sequences img
        img = self.img_in(img)
        vec = self.time_in(timestep_embedding(timesteps, 256).type(self.dtype))

        if self.params.guidance_embed:
            if guidance is None:
                raise ValueError(
                    "Didn't get guidance strength for guidance distilled model."
                )
            vec = vec + self.guidance_in(
                timestep_embedding(guidance, 256).type(self.dtype)
            )
        vec = vec + self.vector_in(y)

        txt = self.txt_in(txt)

        ids = torch.cat((txt_ids, img_ids), dim=1)
        pe = self.pe_embedder(ids)

        # double stream blocks
        for block in self.double_blocks:
            img, txt = block(img=img, txt=txt, vec=vec, pe=pe)

        img = torch.cat((txt, img), 1)

        # single stream blocks
        for block in self.single_blocks:
            img = block(img, vec=vec, pe=pe)

        img = img[:, txt.shape[1] :, ...]
        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
        return img

    @classmethod
    def from_pretrained(
        cls: "Flux", path: str, dtype: torch.dtype = torch.float16
    ) -> "Flux":
        from safetensors.torch import load_file

        from util import load_config_from_path

        config = load_config_from_path(path)
        with torch.device("meta"):
            klass = cls(config=config, dtype=dtype)
            if not config.prequantized_flow:
                klass.type(dtype)

        ckpt = load_file(config.ckpt_path, device="cpu")
        klass.load_state_dict(ckpt, assign=True)
        return klass.to("cpu")
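A smoke-test sketch of the Flux forward contract (not part of this commit; shapes are inferred from the flux-dev FluxParams constructed in util.load_config below, and `config` is assumed to be such a ModelSpec):

import torch
from modules.flux_model import Flux

B, n_img, n_txt = 1, 1024, 512
img = torch.randn(B, n_img, 64)      # packed 2x2 latent patches, in_channels=64
img_ids = torch.zeros(B, n_img, 3)   # per-token (batch, y, x) positions for RoPE
txt = torch.randn(B, n_txt, 4096)    # T5 last_hidden_state, context_in_dim=4096
txt_ids = torch.zeros(B, n_txt, 3)
t = torch.rand(B)                    # flow-matching timesteps
y = torch.randn(B, 768)              # pooled CLIP embedding, vec_in_dim=768
g = torch.full((B,), 3.5)            # required because flux-dev sets guidance_embed=True

model = Flux(config, dtype=torch.bfloat16)
pred = model(img, img_ids, txt, txt_ids, timesteps=t, y=y, guidance=g)
# pred: (B, n_img, 64), one velocity vector per packed latent token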
photo.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8476d6d93124c5265bb9ff0b393600b7ca26b3a566822c036cd9f59141065a9b
size 174924704
start.py
ADDED
File without changes
util.py
ADDED
@@ -0,0 +1,333 @@
import json
from pathlib import Path
from typing import Literal, Optional

import torch
from modules.autoencoder import AutoEncoder, AutoEncoderParams
from modules.conditioner import HFEmbedder
from modules.flux_model import Flux, FluxParams
from safetensors.torch import load_file as load_sft

try:
    from enum import StrEnum
except ImportError:
    from enum import Enum

    class StrEnum(str, Enum):
        pass


from pydantic import BaseModel, ConfigDict
from loguru import logger


class ModelVersion(StrEnum):
    flux_dev = "flux-dev"
    flux_schnell = "flux-schnell"


class QuantizationDtype(StrEnum):
    qfloat8 = "qfloat8"
    qint2 = "qint2"
    qint4 = "qint4"
    qint8 = "qint8"
    bfloat16 = "bfloat16"
    float16 = "float16"


class ModelSpec(BaseModel):
    version: ModelVersion
    params: FluxParams
    ae_params: AutoEncoderParams
    ckpt_path: str | None
    # Add option to pass in custom clip model
    clip_path: str | None = "openai/clip-vit-large-patch14"
    ae_path: str | None
    repo_id: str | None
    repo_flow: str | None
    repo_ae: str | None
    text_enc_max_length: int = 512
    text_enc_path: str | None
    text_enc_device: str | torch.device | None = "cuda:0"
    ae_device: str | torch.device | None = "cuda:0"
    flux_device: str | torch.device | None = "cuda:0"
    flow_dtype: str = "float16"
    ae_dtype: str = "bfloat16"
    text_enc_dtype: str = "bfloat16"
    # unused / deprecated
    num_to_quant: Optional[int] = 20
    quantize_extras: bool = False
    compile_extras: bool = False
    compile_blocks: bool = False
    flow_quantization_dtype: Optional[QuantizationDtype] = QuantizationDtype.qfloat8
    text_enc_quantization_dtype: Optional[QuantizationDtype] = QuantizationDtype.qfloat8
    ae_quantization_dtype: Optional[QuantizationDtype] = None
    clip_quantization_dtype: Optional[QuantizationDtype] = None
    offload_text_encoder: bool = False
    offload_vae: bool = False
    offload_flow: bool = False
    prequantized_flow: bool = False

    # Improved precision via not quantizing the modulation linear layers
    quantize_modulation: bool = True
    # Improved precision via not quantizing the flow embedder layers
    quantize_flow_embedder_layers: bool = False

    model_config: ConfigDict = {
        "arbitrary_types_allowed": True,
        "use_enum_values": True,
    }


def load_models(config: ModelSpec) -> tuple[Flux, AutoEncoder, HFEmbedder, HFEmbedder]:
    flow = load_flow_model(config)
    ae = load_autoencoder(config)
    clip, t5 = load_text_encoders(config)
    return flow, ae, clip, t5


def parse_device(device: str | torch.device | None) -> torch.device:
    if isinstance(device, str):
        return torch.device(device)
    elif isinstance(device, torch.device):
        return device
    else:
        return torch.device("cuda:0")


def into_dtype(dtype: str) -> torch.dtype:
    if isinstance(dtype, torch.dtype):
        return dtype
    if dtype == "float16":
        return torch.float16
    elif dtype == "bfloat16":
        return torch.bfloat16
    elif dtype == "float32":
        return torch.float32
    else:
        raise ValueError(f"Invalid dtype: {dtype}")


def into_device(device: str | torch.device | None) -> torch.device:
    if isinstance(device, str):
        return torch.device(device)
    elif isinstance(device, torch.device):
        return device
    elif isinstance(device, int):
        return torch.device(f"cuda:{device}")
    else:
        return torch.device("cuda:0")


def load_config(
    name: ModelVersion = ModelVersion.flux_dev,
    flux_path: str | None = None,
    ae_path: str | None = None,
    text_enc_path: str | None = None,
    text_enc_device: str | torch.device | None = None,
    ae_device: str | torch.device | None = None,
    flux_device: str | torch.device | None = None,
    flow_dtype: str = "float16",
    ae_dtype: str = "bfloat16",
    text_enc_dtype: str = "bfloat16",
    num_to_quant: Optional[int] = 20,
    compile_extras: bool = False,
    compile_blocks: bool = False,
    offload_text_enc: bool = False,
    offload_ae: bool = False,
    offload_flow: bool = False,
    quant_text_enc: Optional[Literal["float8", "qint2", "qint4", "qint8"]] = None,
    quant_ae: bool = False,
    prequantized_flow: bool = False,
    quantize_modulation: bool = True,
    quantize_flow_embedder_layers: bool = False,
) -> ModelSpec:
    """
    Load a model configuration using the passed arguments.
    """
    text_enc_device = str(parse_device(text_enc_device))
    ae_device = str(parse_device(ae_device))
    flux_device = str(parse_device(flux_device))
    return ModelSpec(
        version=name,
        repo_id=(
            "black-forest-labs/FLUX.1-dev"
            if name == ModelVersion.flux_dev
            else "black-forest-labs/FLUX.1-schnell"
        ),
        repo_flow=(
            "flux1-dev.sft" if name == ModelVersion.flux_dev else "flux1-schnell.sft"
        ),
        repo_ae="ae.sft",
        ckpt_path=flux_path,
        params=FluxParams(
            in_channels=64,
            vec_in_dim=768,
            context_in_dim=4096,
            hidden_size=3072,
            mlp_ratio=4.0,
            num_heads=24,
            depth=19,
            depth_single_blocks=38,
            axes_dim=[16, 56, 56],
            theta=10_000,
            qkv_bias=True,
            guidance_embed=name == ModelVersion.flux_dev,
        ),
        ae_path=ae_path,
        ae_params=AutoEncoderParams(
            resolution=256,
            in_channels=3,
            ch=128,
            out_ch=3,
            ch_mult=[1, 2, 4, 4],
            num_res_blocks=2,
            z_channels=16,
            scale_factor=0.3611,
            shift_factor=0.1159,
        ),
        text_enc_path=text_enc_path,
        text_enc_device=text_enc_device,
        ae_device=ae_device,
        flux_device=flux_device,
        flow_dtype=flow_dtype,
        ae_dtype=ae_dtype,
        text_enc_dtype=text_enc_dtype,
        text_enc_max_length=512 if name == ModelVersion.flux_dev else 256,
        num_to_quant=num_to_quant,
        compile_extras=compile_extras,
        compile_blocks=compile_blocks,
        offload_flow=offload_flow,
        offload_text_encoder=offload_text_enc,
        offload_vae=offload_ae,
        text_enc_quantization_dtype={
            "float8": QuantizationDtype.qfloat8,
            "qint2": QuantizationDtype.qint2,
            "qint4": QuantizationDtype.qint4,
            "qint8": QuantizationDtype.qint8,
        }.get(quant_text_enc, None),
        ae_quantization_dtype=QuantizationDtype.qfloat8 if quant_ae else None,
        prequantized_flow=prequantized_flow,
        quantize_modulation=quantize_modulation,
        quantize_flow_embedder_layers=quantize_flow_embedder_layers,
    )


def load_config_from_path(path: str) -> ModelSpec:
    path_path = Path(path)
    if not path_path.exists():
        raise ValueError(f"Path {path} does not exist")
    if not path_path.is_file():
        raise ValueError(f"Path {path} is not a file")
    return ModelSpec(**json.loads(path_path.read_text()))


def print_load_warning(missing: list[str], unexpected: list[str]) -> None:
    if len(missing) > 0 and len(unexpected) > 0:
        logger.warning(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
        logger.warning("\n" + "-" * 79 + "\n")
        logger.warning(
            f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected)
        )
    elif len(missing) > 0:
        logger.warning(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
    elif len(unexpected) > 0:
        logger.warning(
            f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected)
        )


def load_flow_model(config: ModelSpec) -> Flux:
    ckpt_path = config.ckpt_path
    FluxClass = Flux

    with torch.device("meta"):
        model = FluxClass(config, dtype=into_dtype(config.flow_dtype))
        if not config.prequantized_flow:
            model.type(into_dtype(config.flow_dtype))

    if ckpt_path is not None:
        # load_sft doesn't support torch.device
        sd = load_sft(ckpt_path, device="cpu")
        missing, unexpected = model.load_state_dict(sd, strict=False, assign=True)
        print_load_warning(missing, unexpected)
        if not config.prequantized_flow:
            model.type(into_dtype(config.flow_dtype))
    return model


def load_text_encoders(config: ModelSpec) -> tuple[HFEmbedder, HFEmbedder]:
    clip = HFEmbedder(
        config.clip_path,
        max_length=77,
        torch_dtype=into_dtype(config.text_enc_dtype),
        device=into_device(config.text_enc_device).index or 0,
        is_clip=True,
        quantization_dtype=config.clip_quantization_dtype,
    )
    t5 = HFEmbedder(
        config.text_enc_path,
        max_length=config.text_enc_max_length,
        torch_dtype=into_dtype(config.text_enc_dtype),
        device=into_device(config.text_enc_device).index or 0,
        quantization_dtype=config.text_enc_quantization_dtype,
    )
    return clip, t5


def load_autoencoder(config: ModelSpec) -> AutoEncoder:
    ckpt_path = config.ae_path
    with torch.device("meta" if ckpt_path is not None else config.ae_device):
        ae = AutoEncoder(config.ae_params).to(into_dtype(config.ae_dtype))

    if ckpt_path is not None:
        sd = load_sft(ckpt_path, device=str(config.ae_device))
        missing, unexpected = ae.load_state_dict(sd, strict=False, assign=True)
        print_load_warning(missing, unexpected)
    ae.to(device=into_device(config.ae_device), dtype=into_dtype(config.ae_dtype))
    if config.ae_quantization_dtype is not None:
        from float8_quantize import recursive_swap_linears

        recursive_swap_linears(ae)
    if config.offload_vae:
        ae.to("cpu")
        torch.cuda.empty_cache()
    return ae


class LoadedModels(BaseModel):
    flow: Flux
    ae: AutoEncoder
    clip: HFEmbedder
    t5: HFEmbedder
    config: ModelSpec

    model_config = {
        "arbitrary_types_allowed": True,
        "use_enum_values": True,
    }


def load_models_from_config_path(
    path: str,
) -> LoadedModels:
    config = load_config_from_path(path)
    clip, t5 = load_text_encoders(config)
    return LoadedModels(
        flow=load_flow_model(config),
        ae=load_autoencoder(config),
        clip=clip,
        t5=t5,
        config=config,
    )


def load_models_from_config(config: ModelSpec) -> LoadedModels:
    clip, t5 = load_text_encoders(config)
    return LoadedModels(
        flow=load_flow_model(config),
        ae=load_autoencoder(config),
        clip=clip,
        t5=t5,
        config=config,
    )
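As a usage note, a hedged loading sketch (the local paths are placeholders, not repo defaults; any of the JSON files in configs/ can be used instead via load_models_from_config_path):

from util import ModelVersion, load_config, load_models_from_config

config = load_config(
    name=ModelVersion.flux_dev,
    flux_path="/workspace/flux1-dev.sft",        # placeholder checkpoint path
    ae_path="/workspace/ae.safetensors",         # placeholder VAE path
    text_enc_path="city96/t5-v1_1-xxl-encoder-bf16",
    quant_text_enc="qfloat8",
)
models = load_models_from_config(config)
# models.flow, models.ae, models.clip, models.t5 are ready for the pipeline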