svjack committed
Commit 1bb2f87 (verified) · 1 Parent(s): 63e3b4f

Upload 15 files

fpack_cache_latents.py CHANGED
@@ -2,13 +2,14 @@ import argparse
2
  import logging
3
  import math
4
  import os
5
- from typing import List
6
 
7
  import numpy as np
8
  import torch
9
  import torch.nn.functional as F
10
  from tqdm import tqdm
11
  from transformers import SiglipImageProcessor, SiglipVisionModel
 
12
 
13
  from dataset import config_utils
14
  from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
@@ -28,15 +29,20 @@ def encode_and_save_batch(
28
  feature_extractor: SiglipImageProcessor,
29
  image_encoder: SiglipVisionModel,
30
  batch: List[ItemInfo],
31
- latent_window_size: int,
32
  vanilla_sampling: bool = False,
33
  one_frame: bool = False,
 
 
34
  ):
35
  """Encode a batch of original RGB videos and save FramePack section caches."""
36
  if one_frame:
37
- encode_and_save_batch_one_frame(vae, feature_extractor, image_encoder, batch, latent_window_size, vanilla_sampling)
 
 
38
  return
39
 
 
 
40
  # Stack batch into tensor (B,C,F,H,W) in RGB order
41
  contents = torch.stack([torch.from_numpy(item.content) for item in batch])
42
  if len(contents.shape) == 4:
@@ -238,34 +244,68 @@ def encode_and_save_batch_one_frame(
238
  feature_extractor: SiglipImageProcessor,
239
  image_encoder: SiglipVisionModel,
240
  batch: List[ItemInfo],
241
- latent_window_size: int,
242
  vanilla_sampling: bool = False,
 
 
243
  ):
244
  # item.content: target image (H, W, C)
245
- # item.control_content: start image (H, W, C)
246
-
247
- # Stack batch into tensor (B,F,H,W,C) in RGB order.
248
- contents = torch.stack(
249
- [torch.stack([torch.from_numpy(item.control_content), torch.from_numpy(item.content)]) for item in batch]
250
- )
251
 
252
  contents = contents.permute(0, 4, 1, 2, 3).contiguous() # B, C, F, H, W
253
  contents = contents.to(vae.device, dtype=vae.dtype)
254
  contents = contents / 127.5 - 1.0 # normalize to [-1, 1]
255
 
256
- height, width = contents.shape[3], contents.shape[4]
257
  if height < 8 or width < 8:
258
  item = batch[0] # other items should have the same size
259
  raise ValueError(f"Image or video size too small: {item.item_key} and {len(batch) - 1} more, size: {item.original_size}")
260
 
261
- # VAE encode (list of tensor -> stack)
262
- start_latents = hunyuan.vae_encode(contents[:, :, 0:1], vae) # include scaling factor
263
- start_latents = start_latents.to("cpu") # (B, C, 1, H/8, W/8)
264
- latents = hunyuan.vae_encode(contents[:, :, 1:], vae) # include scaling factor
265
- latents = latents.to("cpu") # (B, C, 1, H/8, W/8)
 
 
 
 
 
 
266
 
267
  # Vision encoding per‑item (once): use control content because it is the start image
268
- images = [item.control_content for item in batch] # list of [H, W, C]
269
 
270
  # encode image with image encoder
271
  image_embeddings = []
@@ -276,56 +316,74 @@ def encode_and_save_batch_one_frame(
276
  image_embeddings = torch.cat(image_embeddings, dim=0) # B, LEN, 1152
277
  image_embeddings = image_embeddings.to("cpu") # Save memory
278
 
279
- # history latents is always zeroes for one frame training
280
- history_latents = torch.zeros(
281
- (1, latents.shape[1], 1 + 2 + 16, latents.shape[3], latents.shape[4]), dtype=latents.dtype
282
- ) # C=16 for HY
283
-
284
- # indices generation (same as inference)
285
- indices = torch.arange(0, sum([1, latent_window_size, 1, 2, 16])).unsqueeze(0)
286
- (
287
- clean_latent_indices_pre, # Index for start_latent
288
- latent_indices, # Indices for the target latents to predict
289
- clean_latent_indices_post, # Index for the most recent history frame
290
- clean_latent_2x_indices, # Indices for the next 2 history frames
291
- clean_latent_4x_indices, # Indices for the next 16 history frames
292
- ) = indices.split([1, latent_window_size, 1, 2, 16], dim=1)
293
-
294
- # Indices for clean_latents (start + recent history)
295
- latent_indices = latent_indices[:, -1:] # Only the last index is used for one frame training
296
- clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
297
-
298
- # clean latents preparation for all items (emulating inference)
299
- clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, : 1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
300
-
301
  for b, item in enumerate(batch):
302
- original_latent_cache_path = item.latent_cache_path
303
 
304
  # clean latents preparation (emulating inference)
305
- clean_latents_pre = start_latents[b : b + 1]
306
- clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2) # Combine start frame + placeholder
 
 
307
 
308
  # Target latents for this section (ground truth)
309
- target_latents = latents[b : b + 1]
 
 
 
 
 
 
 
 
 
 
310
 
311
  # save cache (file path is inside item.latent_cache_path pattern), remove batch dim
312
  save_latent_cache_framepack(
313
  item_info=item,
314
- latent=target_latents.squeeze(0), # Ground truth for this section
315
- latent_indices=latent_indices.squeeze(0), # Indices for the ground truth section
316
- clean_latents=clean_latents.squeeze(0), # Start frame + history placeholder
317
- clean_latent_indices=clean_latent_indices.squeeze(0), # Indices for start frame + history placeholder
318
- clean_latents_2x=clean_latents_2x.squeeze(0), # History placeholder
319
- clean_latent_2x_indices=clean_latent_2x_indices.squeeze(0), # Indices for history placeholder
320
- clean_latents_4x=clean_latents_4x.squeeze(0), # History placeholder
321
- clean_latent_4x_indices=clean_latent_4x_indices.squeeze(0), # Indices for history placeholder
322
  image_embeddings=image_embeddings[b],
323
  )
324
 
325
 
326
  def framepack_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
327
  parser.add_argument("--image_encoder", type=str, required=True, help="Image encoder (CLIP) checkpoint path or directory")
328
- parser.add_argument("--latent_window_size", type=int, default=9, help="FramePack latent window size (default 9)")
329
  parser.add_argument(
330
  "--f1",
331
  action="store_true",
@@ -336,6 +394,16 @@ def framepack_setup_parser(parser: argparse.ArgumentParser) -> argparse.Argument
336
  action="store_true",
337
  help="Generate cache for one frame training (single frame, single section). latent_window_size is used as the index of the target frame.",
338
  )
339
  return parser
340
 
341
 
@@ -373,7 +441,9 @@ def main(args: argparse.Namespace):
373
 
374
  # encoding closure
375
  def encode(batch: List[ItemInfo]):
376
- encode_and_save_batch(vae, feature_extractor, image_encoder, batch, args.latent_window_size, args.f1, args.one_frame)
 
 
377
 
378
  # reuse core loop from cache_latents with no change
379
  encode_datasets_framepack(datasets, encode, args)
@@ -403,7 +473,7 @@ def encode_datasets_framepack(datasets: list[BaseDataset], encode: callable, arg
403
  all_existing = os.path.exists(item.latent_cache_path)
404
  else:
405
  latent_f = (item.frame_count - 1) // 4 + 1
406
- num_sections = max(1, math.floor((latent_f - 1) / args.latent_window_size)) # min 1 section
407
  all_existing = True
408
  for sec in range(num_sections):
409
  p = append_section_idx_to_latent_cache_path(item.latent_cache_path, sec)
 
2
  import logging
3
  import math
4
  import os
5
+ from typing import List, Optional
6
 
7
  import numpy as np
8
  import torch
9
  import torch.nn.functional as F
10
  from tqdm import tqdm
11
  from transformers import SiglipImageProcessor, SiglipVisionModel
12
+ from PIL import Image
13
 
14
  from dataset import config_utils
15
  from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
 
29
  feature_extractor: SiglipImageProcessor,
30
  image_encoder: SiglipVisionModel,
31
  batch: List[ItemInfo],
 
32
  vanilla_sampling: bool = False,
33
  one_frame: bool = False,
34
+ one_frame_no_2x: bool = False,
35
+ one_frame_no_4x: bool = False,
36
  ):
37
  """Encode a batch of original RGB videos and save FramePack section caches."""
38
  if one_frame:
39
+ encode_and_save_batch_one_frame(
40
+ vae, feature_extractor, image_encoder, batch, vanilla_sampling, one_frame_no_2x, one_frame_no_4x
41
+ )
42
  return
43
 
44
+ latent_window_size = batch[0].fp_latent_window_size # all items should have the same window size
45
+
46
  # Stack batch into tensor (B,C,F,H,W) in RGB order
47
  contents = torch.stack([torch.from_numpy(item.content) for item in batch])
48
  if len(contents.shape) == 4:
 
244
  feature_extractor: SiglipImageProcessor,
245
  image_encoder: SiglipVisionModel,
246
  batch: List[ItemInfo],
 
247
  vanilla_sampling: bool = False,
248
+ one_frame_no_2x: bool = False,
249
+ one_frame_no_4x: bool = False,
250
  ):
251
  # item.content: target image (H, W, C)
252
+ # item.control_content: list of images (H, W, C)
253
+
254
+ # Stack batch into tensor (B,F,H,W,C) in RGB order. The numbers of control content for each item are the same.
255
+ contents = []
256
+ content_masks: list[list[Optional[torch.Tensor]]] = []
257
+ for item in batch:
258
+ item_contents = item.control_content + [item.content]
259
+
260
+ item_masks = []
261
+ for i, c in enumerate(item_contents):
262
+ if c.shape[-1] == 4: # RGBA
263
+ item_contents[i] = c[..., :3] # remove alpha channel from content
264
+
265
+ alpha = c[..., 3] # extract alpha channel
266
+ mask_image = Image.fromarray(alpha, mode="L")
267
+ width, height = mask_image.size
268
+ mask_image = mask_image.resize((width // 8, height // 8), Image.LANCZOS)
269
+ mask_image = np.array(mask_image) # PIL to numpy, HWC
270
+ mask_image = torch.from_numpy(mask_image).float() / 255.0 # 0 to 1.0, HWC
271
+ mask_image = mask_image.squeeze(-1) # HWC -> HW
272
+ mask_image = mask_image.unsqueeze(0).unsqueeze(0).unsqueeze(0) # HW -> 111HW (BCFHW)
273
+ mask_image = mask_image.to(torch.float32)
274
+ content_mask = mask_image
275
+ else:
276
+ content_mask = None
277
+
278
+ item_masks.append(content_mask)
279
+
280
+ item_contents = [torch.from_numpy(c) for c in item_contents]
281
+ contents.append(torch.stack(item_contents, dim=0)) # list of [F, H, W, C]
282
+ content_masks.append(item_masks)
283
+
284
+ contents = torch.stack(contents, dim=0) # B, F, H, W, C. F is control frames + target frame
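The RGBA branch above reduces each control image's alpha channel to the latent grid (1/8 of the pixel resolution) and keeps it as a broadcastable B,C,F,H,W mask. A minimal standalone sketch of that conversion (illustration only, not part of this commit; shapes assumed):

import numpy as np
import torch
from PIL import Image

def alpha_to_latent_mask(rgba: np.ndarray) -> torch.Tensor:
    # rgba: (H, W, 4) uint8 image -> (1, 1, 1, H//8, W//8) float mask in [0, 1]
    alpha = rgba[..., 3]
    mask = Image.fromarray(alpha, mode="L")
    w, h = mask.size
    mask = mask.resize((w // 8, h // 8), Image.LANCZOS)  # match the VAE's 8x spatial stride
    mask = torch.from_numpy(np.array(mask)).float() / 255.0
    return mask[None, None, None]  # add B, C, F axes so it broadcasts over the latents

rgba = np.zeros((64, 64, 4), dtype=np.uint8)
rgba[..., 3] = 255  # fully opaque alpha
print(alpha_to_latent_mask(rgba).shape)  # torch.Size([1, 1, 1, 8, 8])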
285
 
286
  contents = contents.permute(0, 4, 1, 2, 3).contiguous() # B, C, F, H, W
287
  contents = contents.to(vae.device, dtype=vae.dtype)
288
  contents = contents / 127.5 - 1.0 # normalize to [-1, 1]
289
 
290
+ height, width = contents.shape[-2], contents.shape[-1]
291
  if height < 8 or width < 8:
292
  item = batch[0] # other items should have the same size
293
  raise ValueError(f"Image or video size too small: {item.item_key} and {len(batch) - 1} more, size: {item.original_size}")
294
 
295
+ # VAE encode: we need to encode one frame at a time because VAE encoder has stride=4 for the time dimension except for the first frame.
296
+ latents = [hunyuan.vae_encode(contents[:, :, idx : idx + 1], vae).to("cpu") for idx in range(contents.shape[2])]
297
+ latents = torch.cat(latents, dim=2) # B, C, F, H/8, W/8
298
+
299
+ # apply alphas to latents
300
+ for b, item in enumerate(batch):
301
+ for i, content_mask in enumerate(content_masks[b]):
302
+ if content_mask is not None:
303
+ # apply mask to the latents
304
+ # print(f"Applying content mask for item {item.item_key}, frame {i}")
305
+ latents[b : b + 1, :, i : i + 1] *= content_mask
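The frame-by-frame encode above matters because the causal video VAE maps F input frames to 1 + (F - 1) // 4 latent frames (temporal stride 4 after the first frame), so encoding all control frames plus the target in one call would merge them. A small sketch of the arithmetic (illustration only, not from this commit):

def latent_frame_count(num_frames: int) -> int:
    # causal video VAE: the first frame maps 1:1, every further 4 frames share one latent frame
    return 1 + (num_frames - 1) // 4

print(latent_frame_count(3))                         # 1 -> three frames encoded together collapse into one latent
print(sum(latent_frame_count(1) for _ in range(3)))  # 3 -> one latent per frame when encoded one at a time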
306
 
307
  # Vision encoding per‑item (once): use control content because it is the start image
308
+ images = [item.control_content[0] for item in batch] # list of [H, W, C]
309
 
310
  # encode image with image encoder
311
  image_embeddings = []
 
316
  image_embeddings = torch.cat(image_embeddings, dim=0) # B, LEN, 1152
317
  image_embeddings = image_embeddings.to("cpu") # Save memory
318
 
319
+ # save cache for each item in the batch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  for b, item in enumerate(batch):
321
+ # indices generation (same as inference): each item may have different clean_latent_indices, so we generate them per item
322
+ clean_latent_indices = item.fp_1f_clean_indices # list of indices for clean latents
323
+ if clean_latent_indices is None or len(clean_latent_indices) == 0:
324
+ logger.warning(
325
+ f"Item {item.item_key} has no clean_latent_indices defined, using default indices for one frame training."
326
+ )
327
+ clean_latent_indices = [0]
328
+
329
+ if not item.fp_1f_no_post:
330
+ clean_latent_indices = clean_latent_indices + [1 + item.fp_latent_window_size]
331
+ clean_latent_indices = torch.Tensor(clean_latent_indices).long() # N
332
+
333
+ latent_index = torch.Tensor([item.fp_1f_target_index]).long() # 1
334
+
335
+ # zero values do not need to be cached even if one_frame_no_2x or one_frame_no_4x is False
336
+ clean_latents_2x = None
337
+ clean_latents_4x = None
338
+
339
+ if one_frame_no_2x:
340
+ clean_latent_2x_indices = None
341
+ else:
342
+ index = 1 + item.fp_latent_window_size + 1
343
+ clean_latent_2x_indices = torch.arange(index, index + 2) # 2
344
+
345
+ if one_frame_no_4x:
346
+ clean_latent_4x_indices = None
347
+ else:
348
+ index = 1 + item.fp_latent_window_size + 1 + 2
349
+ clean_latent_4x_indices = torch.arange(index, index + 16) # 16
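As a concrete illustration of the index layout built above, with assumed values fp_latent_window_size = 9, fp_1f_clean_indices = [0], fp_1f_target_index = 9, fp_1f_no_post = False, and neither no_2x nor no_4x set:

import torch

window = 9
clean_latent_indices = torch.tensor([0, 1 + window]).long()                          # [0, 10]: start image + zero "post" frame
latent_index = torch.tensor([9]).long()                                              # target frame, last slot of the window
clean_latent_2x_indices = torch.arange(1 + window + 1, 1 + window + 1 + 2)           # [11, 12]
clean_latent_4x_indices = torch.arange(1 + window + 1 + 2, 1 + window + 1 + 2 + 16)  # indices 13..28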
350
 
351
  # clean latents preparation (emulating inference)
352
+ clean_latents = latents[b, :, :-1] # C, F, H, W
353
+ if not item.fp_1f_no_post:
354
+ # If zero post is enabled, we need to add a zero frame at the end
355
+ clean_latents = F.pad(clean_latents, (0, 0, 0, 0, 0, 1), value=0.0) # C, F+1, H, W
356
 
357
  # Target latents for this section (ground truth)
358
+ target_latents = latents[b, :, -1:] # C, 1, H, W
359
+
360
+ print(f"Saving cache for item {item.item_key} at {item.latent_cache_path}. no_post: {item.fp_1f_no_post}")
361
+ print(f" Clean latent indices: {clean_latent_indices}, latent index: {latent_index}")
362
+ print(f" Clean latents: {clean_latents.shape}, target latents: {target_latents.shape}")
363
+ print(f" Clean latents 2x indices: {clean_latent_2x_indices}, clean latents 4x indices: {clean_latent_4x_indices}")
364
+ print(
365
+ f" Clean latents 2x: {clean_latents_2x.shape if clean_latents_2x is not None else 'None'}, "
366
+ f"Clean latents 4x: {clean_latents_4x.shape if clean_latents_4x is not None else 'None'}"
367
+ )
368
+ print(f" Image embeddings: {image_embeddings[b].shape}")
369
 
370
  # save cache (file path is inside item.latent_cache_path pattern), remove batch dim
371
  save_latent_cache_framepack(
372
  item_info=item,
373
+ latent=target_latents, # Ground truth for this section
374
+ latent_indices=latent_index, # Indices for the ground truth section
375
+ clean_latents=clean_latents, # Start frame + history placeholder
376
+ clean_latent_indices=clean_latent_indices, # Indices for start frame + history placeholder
377
+ clean_latents_2x=clean_latents_2x, # History placeholder
378
+ clean_latent_2x_indices=clean_latent_2x_indices, # Indices for history placeholder
379
+ clean_latents_4x=clean_latents_4x, # History placeholder
380
+ clean_latent_4x_indices=clean_latent_4x_indices, # Indices for history placeholder
381
  image_embeddings=image_embeddings[b],
382
  )
383
 
384
 
385
  def framepack_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
386
  parser.add_argument("--image_encoder", type=str, required=True, help="Image encoder (CLIP) checkpoint path or directory")
 
387
  parser.add_argument(
388
  "--f1",
389
  action="store_true",
 
394
  action="store_true",
395
  help="Generate cache for one frame training (single frame, single section). latent_window_size is used as the index of the target frame.",
396
  )
397
+ parser.add_argument(
398
+ "--one_frame_no_2x",
399
+ action="store_true",
400
+ help="Do not use clean_latents_2x and clean_latent_2x_indices for one frame training.",
401
+ )
402
+ parser.add_argument(
403
+ "--one_frame_no_4x",
404
+ action="store_true",
405
+ help="Do not use clean_latents_4x and clean_latent_4x_indices for one frame training.",
406
+ )
407
  return parser
408
 
409
 
 
441
 
442
  # encoding closure
443
  def encode(batch: List[ItemInfo]):
444
+ encode_and_save_batch(
445
+ vae, feature_extractor, image_encoder, batch, args.f1, args.one_frame, args.one_frame_no_2x, args.one_frame_no_4x
446
+ )
447
 
448
  # reuse core loop from cache_latents with no change
449
  encode_datasets_framepack(datasets, encode, args)
 
473
  all_existing = os.path.exists(item.latent_cache_path)
474
  else:
475
  latent_f = (item.frame_count - 1) // 4 + 1
476
+ num_sections = max(1, math.floor((latent_f - 1) / item.fp_latent_window_size)) # min 1 section
477
  all_existing = True
478
  for sec in range(num_sections):
479
  p = append_section_idx_to_latent_cache_path(item.latent_cache_path, sec)
fpack_generate_video.py CHANGED
@@ -114,20 +114,17 @@ def parse_args() -> argparse.Namespace:
114
  "--one_frame_inference",
115
  type=str,
116
  default=None,
117
- help="one frame inference, default is None, comma separated values from 'zero_post', 'no_2x', 'no_4x' and 'no_post'.",
118
  )
119
  parser.add_argument(
120
- "--image_mask_path",
121
- type=str,
122
- default=None,
123
- help="path to image mask for one frame inference. If specified, it will be used as mask for input image.",
124
  )
125
  parser.add_argument(
126
- "--end_image_mask_path",
127
  type=str,
128
  default=None,
129
  nargs="*",
130
- help="path to end (reference) image mask for one frame inference. If specified, it will be used as mask for end image.",
131
  )
132
  parser.add_argument("--fps", type=int, default=30, help="video fps, default is 30")
133
  parser.add_argument("--infer_steps", type=int, default=25, help="number of inference steps, default is 25")
@@ -154,7 +151,7 @@ def parse_args() -> argparse.Namespace:
154
  default=None,
155
  help="path to image for image2video inference. If `;;;` is used, it will be used as section images. The notation is same as `--prompt`.",
156
  )
157
- parser.add_argument("--end_image_path", type=str, nargs="*", default=None, help="path to end image for image2video inference")
158
  parser.add_argument(
159
  "--latent_paddings",
160
  type=str,
@@ -180,6 +177,16 @@ def parse_args() -> argparse.Namespace:
180
  parser.add_argument("--fp8", action="store_true", help="use fp8 for DiT model")
181
  parser.add_argument("--fp8_scaled", action="store_true", help="use scaled fp8 for DiT, only for fp8")
182
  # parser.add_argument("--fp8_fast", action="store_true", help="Enable fast FP8 arithmetic (RTX 4XXX+), only for fp8_scaled")
183
  parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for Text Encoder 1 (LLM)")
184
  parser.add_argument(
185
  "--device", type=str, default=None, help="device to use for inference. If None, use CUDA if available, otherwise use CPU"
@@ -248,9 +255,9 @@ def parse_prompt_line(line: str) -> Dict[str, Any]:
248
 
249
  # Create dictionary of overrides
250
  overrides = {"prompt": prompt}
251
- # Initialize end_image_path and end_image_mask_path as a list to accommodate multiple paths
252
- overrides["end_image_path"] = []
253
- overrides["end_image_mask_path"] = []
254
 
255
  for part in parts[1:]:
256
  if not part.strip():
@@ -276,8 +283,8 @@ def parse_prompt_line(line: str) -> Dict[str, Any]:
276
  # overrides["flow_shift"] = float(value)
277
  elif option == "i":
278
  overrides["image_path"] = value
279
- elif option == "im":
280
- overrides["image_mask_path"] = value
281
  # elif option == "cn":
282
  # overrides["control_path"] = value
283
  elif option == "n":
@@ -285,17 +292,19 @@ def parse_prompt_line(line: str) -> Dict[str, Any]:
285
  elif option == "vs": # video_sections
286
  overrides["video_sections"] = int(value)
287
  elif option == "ei": # end_image_path
288
- overrides["end_image_path"].append(value)
289
- elif option == "eim": # end_image_mask_path
290
- overrides["end_image_mask_path"].append(value)
 
 
291
  elif option == "of": # one_frame_inference
292
  overrides["one_frame_inference"] = value
293
 
294
- # If no end_image_path was provided, remove the empty list
295
- if not overrides["end_image_path"]:
296
- del overrides["end_image_path"]
297
- if not overrides["end_image_mask_path"]:
298
- del overrides["end_image_mask_path"]
299
 
300
  return overrides
301
 
@@ -366,6 +375,13 @@ def load_dit_model(args: argparse.Namespace, device: torch.device) -> HunyuanVid
366
 
367
  # do not fp8 optimize because we will merge LoRA weights
368
  model = load_packed_model(device, args.dit, args.attn_mode, loading_device)
 
 
 
 
 
 
 
369
  return model
370
 
371
 
@@ -558,30 +574,44 @@ def prepare_i2v_inputs(
558
 
559
  # prepare image
560
  def preprocess_image(image_path: str):
561
- image = Image.open(image_path).convert("RGB")
 
 
 
 
 
562
 
563
  image_np = np.array(image) # PIL to numpy, HWC
564
 
565
  image_np = image_video_dataset.resize_image_to_bucket(image_np, (width, height))
566
  image_tensor = torch.from_numpy(image_np).float() / 127.5 - 1.0 # -1 to 1.0, HWC
567
  image_tensor = image_tensor.permute(2, 0, 1)[None, :, None] # HWC -> CHW -> NCFHW, N=1, C=3, F=1
568
- return image_tensor, image_np
569
 
570
  section_image_paths = parse_section_strings(args.image_path)
571
 
572
  section_images = {}
573
  for index, image_path in section_image_paths.items():
574
- img_tensor, img_np = preprocess_image(image_path)
575
  section_images[index] = (img_tensor, img_np)
576
 
 
 
 
 
 
 
577
  # check end images
578
- if args.end_image_path is not None and len(args.end_image_path) > 0:
579
- end_image_tensors = []
580
- for end_img_path in args.end_image_path:
581
- end_image_tensor, _ = preprocess_image(end_img_path)
582
- end_image_tensors.append(end_image_tensor)
 
 
583
  else:
584
- end_image_tensors = None
 
585
 
586
  # configure negative prompt
587
  n_prompt = args.negative_prompt if args.negative_prompt else ""
@@ -644,6 +674,7 @@ def prepare_i2v_inputs(
644
  image_encoder.to(device)
645
 
646
  # encode image with image encoder
 
647
  section_image_encoder_last_hidden_states = {}
648
  for index, (img_tensor, img_np) in section_images.items():
649
  with torch.no_grad():
@@ -666,14 +697,14 @@ def prepare_i2v_inputs(
666
  start_latent = hunyuan.vae_encode(img_tensor, vae).cpu()
667
  section_start_latents[index] = start_latent
668
 
669
- # end_latent = hunyuan.vae_encode(end_image_tensor, vae).cpu() if end_image_tensor is not None else None
670
- if end_image_tensors is not None:
671
- end_latents = []
672
- for end_image_tensor in end_image_tensors:
673
- end_latent = hunyuan.vae_encode(end_image_tensor, vae).cpu()
674
- end_latents.append(end_latent)
675
- else:
676
- end_latents = None
677
 
678
  vae.to("cpu") # move VAE to CPU to save memory
679
  clean_memory_on_device(device)
@@ -710,7 +741,7 @@ def prepare_i2v_inputs(
710
  }
711
  arg_c_img[index] = arg_c_img_i
712
 
713
- return height, width, video_seconds, arg_c, arg_null, arg_c_img, end_latents
714
 
715
 
716
  # def setup_scheduler(args: argparse.Namespace, config, device: torch.device) -> Tuple[Any, torch.Tensor]:
@@ -930,13 +961,15 @@ def generate(
930
  if shared_models is not None:
931
  # Use shared models and encoded data
932
  vae = shared_models.get("vae")
933
- height, width, video_seconds, context, context_null, context_img, end_latents = prepare_i2v_inputs(
934
- args, device, vae, shared_models
935
  )
936
  else:
937
  # prepare inputs without shared models
938
  vae = load_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, device)
939
- height, width, video_seconds, context, context_null, context_img, end_latents = prepare_i2v_inputs(args, device, vae)
 
 
940
 
941
  if shared_models is None or "model" not in shared_models:
942
  # load DiT model
@@ -986,294 +1019,231 @@ def generate(
986
  for mode in args.one_frame_inference.split(","):
987
  one_frame_inference.add(mode.strip())
988
 
989
- # prepare history latents
990
- history_latents = torch.zeros((1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32)
991
- if end_latents is not None and not f1_mode:
992
- logger.info(f"Use end image(s): {args.end_image_path}")
993
- for i, end_latent in enumerate(end_latents):
994
- history_latents[:, :, i + 1 : i + 2] = end_latent.to(history_latents)
995
-
996
- # prepare clean latents and indices
997
- if not f1_mode:
998
- # Inverted Anti-drifting
999
- total_generated_latent_frames = 0
1000
- latent_paddings = reversed(range(total_latent_sections))
1001
-
1002
- if total_latent_sections > 4 and one_frame_inference is None:
1003
- # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some
1004
- # items looks better than expanding it when total_latent_sections > 4
1005
- # One can try to remove below trick and just
1006
- # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
1007
- # 4 sections: 3, 2, 1, 0. 50 sections: 3, 2, 2, ... 2, 1, 0
1008
- latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
1009
-
1010
- if args.latent_paddings is not None:
1011
- # parse user defined latent paddings
1012
- user_latent_paddings = [int(x) for x in args.latent_paddings.split(",")]
1013
- if len(user_latent_paddings) < total_latent_sections:
1014
- print(
1015
- f"User defined latent paddings length {len(user_latent_paddings)} does not match total sections {total_latent_sections}."
1016
- )
1017
- print(f"Use default paddings instead for unspecified sections.")
1018
- latent_paddings[: len(user_latent_paddings)] = user_latent_paddings
1019
- elif len(user_latent_paddings) > total_latent_sections:
1020
- print(
1021
- f"User defined latent paddings length {len(user_latent_paddings)} is greater than total sections {total_latent_sections}."
1022
- )
1023
- print(f"Use only first {total_latent_sections} paddings instead.")
1024
- latent_paddings = user_latent_paddings[:total_latent_sections]
1025
- else:
1026
- latent_paddings = user_latent_paddings
1027
  else:
1028
- start_latent = context_img[0]["start_latent"]
1029
- history_latents = torch.cat([history_latents, start_latent], dim=2)
1030
- total_generated_latent_frames = 1 # a bit hacky, but we employ the same logic as in official code
1031
- latent_paddings = [0] * total_latent_sections # dummy paddings for F1 mode
1032
-
1033
- latent_paddings = list(latent_paddings) # make sure it's a list
1034
- for loop_index in range(total_latent_sections):
1035
- latent_padding = latent_paddings[loop_index]
1036
 
 
1037
  if not f1_mode:
1038
  # Inverted Anti-drifting
1039
- section_index_reverse = loop_index # 0, 1, 2, 3
1040
- section_index = total_latent_sections - 1 - section_index_reverse # 3, 2, 1, 0
1041
- section_index_from_last = -(section_index_reverse + 1) # -1, -2, -3, -4
1042
-
1043
- is_last_section = section_index == 0
1044
- is_first_section = section_index_reverse == 0
1045
- latent_padding_size = latent_padding * latent_window_size
1046
-
1047
- logger.info(f"latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}")
1048
- else:
1049
- section_index = loop_index # 0, 1, 2, 3
1050
- section_index_from_last = section_index - total_latent_sections # -4, -3, -2, -1
1051
- is_last_section = loop_index == total_latent_sections - 1
1052
- is_first_section = loop_index == 0
1053
- latent_padding_size = 0 # dummy padding for F1 mode
1054
-
1055
- # select start latent
1056
- if section_index_from_last in context_img:
1057
- image_index = section_index_from_last
1058
- elif section_index in context_img:
1059
- image_index = section_index
1060
- else:
1061
- image_index = 0
1062
-
1063
- start_latent = context_img[image_index]["start_latent"]
1064
- image_path = context_img[image_index]["image_path"]
1065
- if image_index != 0: # use section image other than section 0
1066
- logger.info(f"Apply experimental section image, latent_padding_size = {latent_padding_size}, image_path = {image_path}")
1067
-
1068
- if not f1_mode:
1069
- # Inverted Anti-drifting
1070
- indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
1071
- (
1072
- clean_latent_indices_pre,
1073
- blank_indices,
1074
- latent_indices,
1075
- clean_latent_indices_post,
1076
- clean_latent_2x_indices,
1077
- clean_latent_4x_indices,
1078
- ) = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
1079
- clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
1080
-
1081
- clean_latents_pre = start_latent.to(history_latents)
1082
- clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, : 1 + 2 + 16, :, :].split(
1083
- [1, 2, 16], dim=2
1084
- )
1085
- clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
1086
-
1087
- if end_latents is not None:
1088
- clean_latents = torch.cat([clean_latents_pre, history_latents[:, :, : len(end_latents)]], dim=2)
1089
- clean_latent_indices_extended = torch.zeros(1, 1 + len(end_latents), dtype=clean_latent_indices.dtype)
1090
- clean_latent_indices_extended[:, :2] = clean_latent_indices
1091
- clean_latent_indices = clean_latent_indices_extended
1092
-
1093
- else:
1094
- # F1 mode
1095
- indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
1096
- (
1097
- clean_latent_indices_start,
1098
- clean_latent_4x_indices,
1099
- clean_latent_2x_indices,
1100
- clean_latent_1x_indices,
1101
- latent_indices,
1102
- ) = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
1103
- clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
1104
-
1105
- clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]) :, :, :].split(
1106
- [16, 2, 1], dim=2
1107
- )
1108
- clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
1109
-
1110
- # if use_teacache:
1111
- # transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
1112
- # else:
1113
- # transformer.initialize_teacache(enable_teacache=False)
1114
-
1115
- # prepare conditioning inputs
1116
- if section_index_from_last in context:
1117
- prompt_index = section_index_from_last
1118
- elif section_index in context:
1119
- prompt_index = section_index
1120
  else:
1121
- prompt_index = 0
 
1122
 
1123
- context_for_index = context[prompt_index]
1124
- # if args.section_prompts is not None:
1125
- logger.info(f"Section {section_index}: {context_for_index['prompt']}")
 
 
 
1126
 
1127
- llama_vec = context_for_index["llama_vec"].to(device, dtype=torch.bfloat16)
1128
- llama_attention_mask = context_for_index["llama_attention_mask"].to(device)
1129
- clip_l_pooler = context_for_index["clip_l_pooler"].to(device, dtype=torch.bfloat16)
1130
 
1131
- image_encoder_last_hidden_state = context_img[image_index]["image_encoder_last_hidden_state"].to(
1132
- device, dtype=torch.bfloat16
1133
- )
1134
 
1135
- llama_vec_n = context_null["llama_vec"].to(device, dtype=torch.bfloat16)
1136
- llama_attention_mask_n = context_null["llama_attention_mask"].to(device)
1137
- clip_l_pooler_n = context_null["clip_l_pooler"].to(device, dtype=torch.bfloat16)
1138
 
1139
- # call DiT model to generate latents
1140
- sample_num_frames = num_frames
1141
- if one_frame_inference is not None:
1142
- # one frame inference
1143
- latent_indices = latent_indices[:, -1:] # only use the last frame (default)
1144
- sample_num_frames = 1
1145
-
1146
- def get_latent_mask(mask_path: str):
1147
- mask_image = Image.open(mask_path).convert("L") # grayscale
1148
- mask_image = mask_image.resize((width // 8, height // 8), Image.LANCZOS)
1149
- mask_image = np.array(mask_image) # PIL to numpy, HWC
1150
- mask_image = torch.from_numpy(mask_image).float() / 255.0 # 0 to 1.0, HWC
1151
- mask_image = mask_image.squeeze(-1) # HWC -> HW
1152
- mask_image = mask_image.unsqueeze(0).unsqueeze(0) # HW -> 11HW
1153
- mask_image = mask_image.to(clean_latents)
1154
- return mask_image
1155
-
1156
- if args.image_mask_path is not None:
1157
- mask_image = get_latent_mask(args.image_mask_path)
1158
- logger.info(f"Apply mask for clean latents (start image): {args.image_mask_path}, shape: {mask_image.shape}")
1159
- clean_latents[:, :, 0, :, :] = clean_latents[:, :, 0, :, :] * mask_image
1160
- if args.end_image_mask_path is not None and len(args.end_image_mask_path) > 0:
1161
- # # apply mask for clean latents 1x (end image)
1162
- count = min(len(args.end_image_mask_path), len(end_latents))
1163
- for i in range(count):
1164
- mask_image = get_latent_mask(args.end_image_mask_path[i])
1165
- logger.info(
1166
- f"Apply mask for clean latents 1x (end image) for {i+1}: {args.end_image_mask_path[i]}, shape: {mask_image.shape}"
1167
- )
1168
- clean_latents[:, :, i + 1 : i + 2, :, :] = clean_latents[:, :, i + 1 : i + 2, :, :] * mask_image
1169
-
1170
- for one_frame_param in one_frame_inference:
1171
- if one_frame_param.startswith("target_index="):
1172
- target_index = int(one_frame_param.split("=")[1])
1173
- latent_indices[:, 0] = target_index
1174
- logger.info(f"Set index for target: {target_index}")
1175
- elif one_frame_param.startswith("start_index="):
1176
- start_index = int(one_frame_param.split("=")[1])
1177
- clean_latent_indices[:, 0] = start_index
1178
- logger.info(f"Set index for clean latent pre (start image): {start_index}")
1179
- elif one_frame_param.startswith("history_index="):
1180
- history_indices = one_frame_param.split("=")[1].split(";")
1181
- i = 0
1182
- while i < len(history_indices) and i < len(end_latents):
1183
- history_index = int(history_indices[i])
1184
- clean_latent_indices[:, 1 + i] = history_index
1185
- i += 1
1186
- while i < len(end_latents):
1187
- clean_latent_indices[:, 1 + i] = history_index
1188
- i += 1
1189
- logger.info(f"Set index for clean latent post (end image): {history_indices}")
1190
-
1191
- if "no_2x" in one_frame_inference:
1192
- clean_latents_2x = None
1193
- clean_latent_2x_indices = None
1194
- logger.info(f"No clean_latents_2x")
1195
- if "no_4x" in one_frame_inference:
1196
- clean_latents_4x = None
1197
- clean_latent_4x_indices = None
1198
- logger.info(f"No clean_latents_4x")
1199
- if "no_post" in one_frame_inference:
1200
- clean_latents = clean_latents[:, :, :1, :, :]
1201
- clean_latent_indices = clean_latent_indices[:, :1]
1202
- logger.info(f"No clean_latents post")
1203
- elif "zero_post" in one_frame_inference:
1204
- # zero out the history latents. this seems to prevent the images from corrupting
1205
- clean_latents[:, :, 1:, :, :] = torch.zeros_like(clean_latents[:, :, 1:, :, :])
1206
- logger.info(f"Zero out clean_latents post")
1207
 
1208
- logger.info(
1209
- f"One frame inference. clean_latent: {clean_latents.shape} latent_indices: {latent_indices}, clean_latent_indices: {clean_latent_indices}, num_frames: {sample_num_frames}"
1210
  )
1211
 
1212
- generated_latents = sample_hunyuan(
1213
- transformer=model,
1214
- sampler=args.sample_solver,
1215
- width=width,
1216
- height=height,
1217
- frames=sample_num_frames,
1218
- real_guidance_scale=args.guidance_scale,
1219
- distilled_guidance_scale=args.embedded_cfg_scale,
1220
- guidance_rescale=args.guidance_rescale,
1221
- # shift=3.0,
1222
- num_inference_steps=args.infer_steps,
1223
- generator=seed_g,
1224
- prompt_embeds=llama_vec,
1225
- prompt_embeds_mask=llama_attention_mask,
1226
- prompt_poolers=clip_l_pooler,
1227
- negative_prompt_embeds=llama_vec_n,
1228
- negative_prompt_embeds_mask=llama_attention_mask_n,
1229
- negative_prompt_poolers=clip_l_pooler_n,
1230
- device=device,
1231
- dtype=torch.bfloat16,
1232
- image_embeddings=image_encoder_last_hidden_state,
1233
- latent_indices=latent_indices,
1234
- clean_latents=clean_latents,
1235
- clean_latent_indices=clean_latent_indices,
1236
- clean_latents_2x=clean_latents_2x,
1237
- clean_latent_2x_indices=clean_latent_2x_indices,
1238
- clean_latents_4x=clean_latents_4x,
1239
- clean_latent_4x_indices=clean_latent_4x_indices,
1240
- )
1241
-
1242
- # concatenate generated latents
1243
- total_generated_latent_frames += int(generated_latents.shape[2])
1244
- if not f1_mode:
1245
- # Inverted Anti-drifting: prepend generated latents to history latents
1246
- if is_last_section:
1247
- generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
1248
- total_generated_latent_frames += 1
1249
 
1250
- history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
1251
- real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
1252
- else:
1253
- # F1 mode: append generated latents to history latents
1254
- history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
1255
- real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
1256
-
1257
- logger.info(f"Generated. Latent shape {real_history_latents.shape}")
1258
-
1259
- # # TODO support saving intermediate video
1260
- # clean_memory_on_device(device)
1261
- # vae.to(device)
1262
- # if history_pixels is None:
1263
- # history_pixels = hunyuan.vae_decode(real_history_latents, vae).cpu()
1264
- # else:
1265
- # section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
1266
- # overlapped_frames = latent_window_size * 4 - 3
1267
- # current_pixels = hunyuan.vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu()
1268
- # history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
1269
- # vae.to("cpu")
1270
- # # if not is_last_section:
1271
- # # # save intermediate video
1272
- # # save_video(history_pixels[0], args, total_generated_latent_frames)
1273
- # print(f"Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}")
1274
 
1275
- if one_frame_inference is not None:
1276
- real_history_latents = real_history_latents[:, :, 1:, :, :] # remove the first frame (start_latent)
1277
 
1278
  # Only clean up shared models if they were created within this function
1279
  if shared_models is None:
@@ -1284,8 +1254,9 @@ def generate(
1284
  model.to("cpu")
1285
 
1286
  # wait for 5 seconds until block swap is done
1287
- logger.info("Waiting for 5 seconds to finish block swap")
1288
- time.sleep(5)
 
1289
 
1290
  gc.collect()
1291
  clean_memory_on_device(device)
@@ -1293,6 +1264,156 @@ def generate(
1293
  return vae, real_history_latents
1294
 
1295
 
1296
  def save_latent(latent: torch.Tensor, args: argparse.Namespace, height: int, width: int) -> str:
1297
  """Save latent to file
1298
 
 
114
  "--one_frame_inference",
115
  type=str,
116
  default=None,
117
+ help="one frame inference, default is None, comma separated values from 'no_2x', 'no_4x', 'no_post', 'control_indices' and 'target_index'.",
118
  )
119
  parser.add_argument(
120
+ "--control_image_path", type=str, default=None, nargs="*", help="path to control (reference) image for one frame inference."
 
 
 
121
  )
122
  parser.add_argument(
123
+ "--control_image_mask_path",
124
  type=str,
125
  default=None,
126
  nargs="*",
127
+ help="path to control (reference) image mask for one frame inference.",
128
  )
129
  parser.add_argument("--fps", type=int, default=30, help="video fps, default is 30")
130
  parser.add_argument("--infer_steps", type=int, default=25, help="number of inference steps, default is 25")
 
151
  default=None,
152
  help="path to image for image2video inference. If `;;;` is used, it will be used as section images. The notation is same as `--prompt`.",
153
  )
154
+ parser.add_argument("--end_image_path", type=str, default=None, help="path to end image for image2video inference")
155
  parser.add_argument(
156
  "--latent_paddings",
157
  type=str,
 
177
  parser.add_argument("--fp8", action="store_true", help="use fp8 for DiT model")
178
  parser.add_argument("--fp8_scaled", action="store_true", help="use scaled fp8 for DiT, only for fp8")
179
  # parser.add_argument("--fp8_fast", action="store_true", help="Enable fast FP8 arithmetic (RTX 4XXX+), only for fp8_scaled")
180
+ parser.add_argument(
181
+ "--rope_scaling_factor", type=float, default=0.5, help="RoPE scaling factor for high resolution (H/W), default is 0.5"
182
+ )
183
+ parser.add_argument(
184
+ "--rope_scaling_timestep_threshold",
185
+ type=int,
186
+ default=None,
187
+ help="RoPE scaling timestep threshold, default is None (disable), if set, RoPE scaling will be applied only for timesteps >= threshold, around 800 is good starting point",
188
+ )
189
+
190
  parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for Text Encoder 1 (LLM)")
191
  parser.add_argument(
192
  "--device", type=str, default=None, help="device to use for inference. If None, use CUDA if available, otherwise use CPU"
 
255
 
256
  # Create dictionary of overrides
257
  overrides = {"prompt": prompt}
258
+ # Initialize control_image_path and control_image_mask_path as a list to accommodate multiple paths
259
+ overrides["control_image_path"] = []
260
+ overrides["control_image_mask_path"] = []
261
 
262
  for part in parts[1:]:
263
  if not part.strip():
 
283
  # overrides["flow_shift"] = float(value)
284
  elif option == "i":
285
  overrides["image_path"] = value
286
+ # elif option == "im":
287
+ # overrides["image_mask_path"] = value
288
  # elif option == "cn":
289
  # overrides["control_path"] = value
290
  elif option == "n":
 
292
  elif option == "vs": # video_sections
293
  overrides["video_sections"] = int(value)
294
  elif option == "ei": # end_image_path
295
+ overrides["end_image_path"] = value
296
+ elif option == "ci": # control_image_path
297
+ overrides["control_image_path"].append(value)
298
+ elif option == "cim": # control_image_mask_path
299
+ overrides["control_image_mask_path"].append(value)
300
  elif option == "of": # one_frame_inference
301
  overrides["one_frame_inference"] = value
302
 
303
+ # If no control_image_path was provided, remove the empty list
304
+ if not overrides["control_image_path"]:
305
+ del overrides["control_image_path"]
306
+ if not overrides["control_image_mask_path"]:
307
+ del overrides["control_image_mask_path"]
308
 
309
  return overrides
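For example, a prompt-file line that uses the new control-image options (assuming the usual " --option value" syntax of these prompt files; paths are placeholders) would be parsed by the function above roughly as follows:

line = "1girl, wearing the reference outfit --ci ref_character.png --ci ref_outfit.png --cim character_mask.png --of target_index=1"
overrides = parse_prompt_line(line)
# expected result:
# overrides["control_image_path"]      == ["ref_character.png", "ref_outfit.png"]  # repeated --ci values
# overrides["control_image_mask_path"] == ["character_mask.png"]                   # --cim values
# overrides["one_frame_inference"]     == "target_index=1"                         # --of value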
310
 
 
375
 
376
  # do not fp8 optimize because we will merge LoRA weights
377
  model = load_packed_model(device, args.dit, args.attn_mode, loading_device)
378
+
379
+ # apply RoPE scaling factor
380
+ if args.rope_scaling_timestep_threshold is not None:
381
+ logger.info(
382
+ f"Applying RoPE scaling factor {args.rope_scaling_factor} for timesteps >= {args.rope_scaling_timestep_threshold}"
383
+ )
384
+ model.enable_rope_scaling(args.rope_scaling_timestep_threshold, args.rope_scaling_factor)
385
  return model
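The new --rope_scaling_factor / --rope_scaling_timestep_threshold options gate RoPE scaling on the denoising timestep: per the help text, the spatial (H/W) positions are scaled by the factor (default 0.5) only for timesteps at or above the threshold (around 800 is suggested). A hedged sketch of that gating logic, not the repository's actual enable_rope_scaling implementation:

from typing import Optional

def rope_hw_scale(timestep: float, threshold: Optional[int], factor: float = 0.5) -> float:
    # apply the scaling factor only during early, high-noise timesteps
    if threshold is None or timestep < threshold:
        return 1.0
    return factor

print(rope_hw_scale(900, 800))  # 0.5 -> scaling active
print(rope_hw_scale(300, 800))  # 1.0 -> normal RoPE positions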
386
 
387
 
 
574
 
575
  # prepare image
576
  def preprocess_image(image_path: str):
577
+ image = Image.open(image_path)
578
+ if image.mode == "RGBA":
579
+ alpha = image.split()[-1]
580
+ else:
581
+ alpha = None
582
+ image = image.convert("RGB")
583
 
584
  image_np = np.array(image) # PIL to numpy, HWC
585
 
586
  image_np = image_video_dataset.resize_image_to_bucket(image_np, (width, height))
587
  image_tensor = torch.from_numpy(image_np).float() / 127.5 - 1.0 # -1 to 1.0, HWC
588
  image_tensor = image_tensor.permute(2, 0, 1)[None, :, None] # HWC -> CHW -> NCFHW, N=1, C=3, F=1
589
+ return image_tensor, image_np, alpha
590
 
591
  section_image_paths = parse_section_strings(args.image_path)
592
 
593
  section_images = {}
594
  for index, image_path in section_image_paths.items():
595
+ img_tensor, img_np, _ = preprocess_image(image_path)
596
  section_images[index] = (img_tensor, img_np)
597
 
598
+ # check end image
599
+ if args.end_image_path is not None:
600
+ end_image_tensor, _, _ = preprocess_image(args.end_image_path)
601
+ else:
602
+ end_image_tensor = None
603
+
604
  # check end images
605
+ if args.control_image_path is not None and len(args.control_image_path) > 0:
606
+ control_image_tensors = []
607
+ control_mask_images = []
608
+ for ctrl_image_path in args.control_image_path:
609
+ control_image_tensor, _, control_mask = preprocess_image(ctrl_image_path)
610
+ control_image_tensors.append(control_image_tensor)
611
+ control_mask_images.append(control_mask)
612
  else:
613
+ control_image_tensors = None
614
+ control_mask_images = None
615
 
616
  # configure negative prompt
617
  n_prompt = args.negative_prompt if args.negative_prompt else ""
 
674
  image_encoder.to(device)
675
 
676
  # encode image with image encoder
677
+
678
  section_image_encoder_last_hidden_states = {}
679
  for index, (img_tensor, img_np) in section_images.items():
680
  with torch.no_grad():
 
697
  start_latent = hunyuan.vae_encode(img_tensor, vae).cpu()
698
  section_start_latents[index] = start_latent
699
 
700
+ end_latent = hunyuan.vae_encode(end_image_tensor, vae).cpu() if end_image_tensor is not None else None
701
+
702
+ control_latents = None
703
+ if control_image_tensors is not None:
704
+ control_latents = []
705
+ for ctrl_image_tensor in control_image_tensors:
706
+ control_latent = hunyuan.vae_encode(ctrl_image_tensor, vae).cpu()
707
+ control_latents.append(control_latent)
708
 
709
  vae.to("cpu") # move VAE to CPU to save memory
710
  clean_memory_on_device(device)
 
741
  }
742
  arg_c_img[index] = arg_c_img_i
743
 
744
+ return height, width, video_seconds, arg_c, arg_null, arg_c_img, end_latent, control_latents, control_mask_images
745
 
746
 
747
  # def setup_scheduler(args: argparse.Namespace, config, device: torch.device) -> Tuple[Any, torch.Tensor]:
 
961
  if shared_models is not None:
962
  # Use shared models and encoded data
963
  vae = shared_models.get("vae")
964
+ height, width, video_seconds, context, context_null, context_img, end_latent, control_latents, control_mask_images = (
965
+ prepare_i2v_inputs(args, device, vae, shared_models)
966
  )
967
  else:
968
  # prepare inputs without shared models
969
  vae = load_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, device)
970
+ height, width, video_seconds, context, context_null, context_img, end_latent, control_latents, control_mask_images = (
971
+ prepare_i2v_inputs(args, device, vae)
972
+ )
973
 
974
  if shared_models is None or "model" not in shared_models:
975
  # load DiT model
 
1019
  for mode in args.one_frame_inference.split(","):
1020
  one_frame_inference.add(mode.strip())
1021
 
1022
+ if one_frame_inference is not None:
1023
+ real_history_latents = generate_with_one_frame_inference(
1024
+ args,
1025
+ model,
1026
+ context,
1027
+ context_null,
1028
+ context_img,
1029
+ control_latents,
1030
+ control_mask_images,
1031
+ latent_window_size,
1032
+ height,
1033
+ width,
1034
+ device,
1035
+ seed_g,
1036
+ one_frame_inference,
1037
+ )
 
1038
  else:
1039
+ # prepare history latents
1040
+ history_latents = torch.zeros((1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32)
1041
+ if end_latent is not None and not f1_mode:
1042
+ logger.info(f"Use end image(s): {args.end_image_path}")
1043
+ history_latents[:, :, :1] = end_latent.to(history_latents)
 
 
 
1044
 
1045
+ # prepare clean latents and indices
1046
  if not f1_mode:
1047
  # Inverted Anti-drifting
1048
+ total_generated_latent_frames = 0
1049
+ latent_paddings = reversed(range(total_latent_sections))
1050
+
1051
+ if total_latent_sections > 4 and one_frame_inference is None:
1052
+ # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some
1053
+ # items looks better than expanding it when total_latent_sections > 4
1054
+ # One can try to remove below trick and just
1055
+ # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
1056
+ # 4 sections: 3, 2, 1, 0. 50 sections: 3, 2, 2, ... 2, 1, 0
1057
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
1058
+
1059
+ if args.latent_paddings is not None:
1060
+ # parse user defined latent paddings
1061
+ user_latent_paddings = [int(x) for x in args.latent_paddings.split(",")]
1062
+ if len(user_latent_paddings) < total_latent_sections:
1063
+ print(
1064
+ f"User defined latent paddings length {len(user_latent_paddings)} does not match total sections {total_latent_sections}."
1065
+ )
1066
+ print(f"Use default paddings instead for unspecified sections.")
1067
+ latent_paddings[: len(user_latent_paddings)] = user_latent_paddings
1068
+ elif len(user_latent_paddings) > total_latent_sections:
1069
+ print(
1070
+ f"User defined latent paddings length {len(user_latent_paddings)} is greater than total sections {total_latent_sections}."
1071
+ )
1072
+ print(f"Use only first {total_latent_sections} paddings instead.")
1073
+ latent_paddings = user_latent_paddings[:total_latent_sections]
1074
+ else:
1075
+ latent_paddings = user_latent_paddings
1076
  else:
1077
+ start_latent = context_img[0]["start_latent"]
1078
+ history_latents = torch.cat([history_latents, start_latent], dim=2)
1079
+ total_generated_latent_frames = 1 # a bit hacky, but we employ the same logic as in official code
1080
+ latent_paddings = [0] * total_latent_sections # dummy paddings for F1 mode
1081
+
1082
+ latent_paddings = list(latent_paddings) # make sure it's a list
1083
+ for loop_index in range(total_latent_sections):
1084
+ latent_padding = latent_paddings[loop_index]
1085
+
1086
+ if not f1_mode:
1087
+ # Inverted Anti-drifting
1088
+ section_index_reverse = loop_index # 0, 1, 2, 3
1089
+ section_index = total_latent_sections - 1 - section_index_reverse # 3, 2, 1, 0
1090
+ section_index_from_last = -(section_index_reverse + 1) # -1, -2, -3, -4
1091
+
1092
+ is_last_section = section_index == 0
1093
+ is_first_section = section_index_reverse == 0
1094
+ latent_padding_size = latent_padding * latent_window_size
1095
+
1096
+ logger.info(f"latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}")
1097
+ else:
1098
+ section_index = loop_index # 0, 1, 2, 3
1099
+ section_index_from_last = section_index - total_latent_sections # -4, -3, -2, -1
1100
+ is_last_section = loop_index == total_latent_sections - 1
1101
+ is_first_section = loop_index == 0
1102
+ latent_padding_size = 0 # dummy padding for F1 mode
1103
+
1104
+ # select start latent
1105
+ if section_index_from_last in context_img:
1106
+ image_index = section_index_from_last
1107
+ elif section_index in context_img:
1108
+ image_index = section_index
1109
+ else:
1110
+ image_index = 0
1111
 
1112
+ start_latent = context_img[image_index]["start_latent"]
1113
+ image_path = context_img[image_index]["image_path"]
1114
+ if image_index != 0: # use section image other than section 0
1115
+ logger.info(
1116
+ f"Apply experimental section image, latent_padding_size = {latent_padding_size}, image_path = {image_path}"
1117
+ )
1118
 
1119
+ if not f1_mode:
1120
+ # Inverted Anti-drifting
1121
+ indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
1122
+ (
1123
+ clean_latent_indices_pre,
1124
+ blank_indices,
1125
+ latent_indices,
1126
+ clean_latent_indices_post,
1127
+ clean_latent_2x_indices,
1128
+ clean_latent_4x_indices,
1129
+ ) = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
1130
+
1131
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
1132
+
1133
+ clean_latents_pre = start_latent.to(history_latents)
1134
+ clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, : 1 + 2 + 16, :, :].split(
1135
+ [1, 2, 16], dim=2
1136
+ )
1137
+ clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
1138
 
1139
+ else:
1140
+ # F1 mode
1141
+ indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
1142
+ (
1143
+ clean_latent_indices_start,
1144
+ clean_latent_4x_indices,
1145
+ clean_latent_2x_indices,
1146
+ clean_latent_1x_indices,
1147
+ latent_indices,
1148
+ ) = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
1149
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
1150
+
1151
+ clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]) :, :, :].split(
1152
+ [16, 2, 1], dim=2
1153
+ )
1154
+ clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
1155
+
1156
+ # if use_teacache:
1157
+ # transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
1158
+ # else:
1159
+ # transformer.initialize_teacache(enable_teacache=False)
1160
+
1161
+ # prepare conditioning inputs
1162
+ if section_index_from_last in context:
1163
+ prompt_index = section_index_from_last
1164
+ elif section_index in context:
1165
+ prompt_index = section_index
1166
+ else:
1167
+ prompt_index = 0
1168
 
1169
+ context_for_index = context[prompt_index]
1170
+ # if args.section_prompts is not None:
1171
+ logger.info(f"Section {section_index}: {context_for_index['prompt']}")
1172
 
1173
+ llama_vec = context_for_index["llama_vec"].to(device, dtype=torch.bfloat16)
1174
+ llama_attention_mask = context_for_index["llama_attention_mask"].to(device)
1175
+ clip_l_pooler = context_for_index["clip_l_pooler"].to(device, dtype=torch.bfloat16)
 
1176
 
1177
+ image_encoder_last_hidden_state = context_img[image_index]["image_encoder_last_hidden_state"].to(
1178
+ device, dtype=torch.bfloat16
1179
  )
1180
 
1181
+ llama_vec_n = context_null["llama_vec"].to(device, dtype=torch.bfloat16)
1182
+ llama_attention_mask_n = context_null["llama_attention_mask"].to(device)
1183
+ clip_l_pooler_n = context_null["clip_l_pooler"].to(device, dtype=torch.bfloat16)
1184
+
1185
+ generated_latents = sample_hunyuan(
1186
+ transformer=model,
1187
+ sampler=args.sample_solver,
1188
+ width=width,
1189
+ height=height,
1190
+ frames=num_frames,
1191
+ real_guidance_scale=args.guidance_scale,
1192
+ distilled_guidance_scale=args.embedded_cfg_scale,
1193
+ guidance_rescale=args.guidance_rescale,
1194
+ # shift=3.0,
1195
+ num_inference_steps=args.infer_steps,
1196
+ generator=seed_g,
1197
+ prompt_embeds=llama_vec,
1198
+ prompt_embeds_mask=llama_attention_mask,
1199
+ prompt_poolers=clip_l_pooler,
1200
+ negative_prompt_embeds=llama_vec_n,
1201
+ negative_prompt_embeds_mask=llama_attention_mask_n,
1202
+ negative_prompt_poolers=clip_l_pooler_n,
1203
+ device=device,
1204
+ dtype=torch.bfloat16,
1205
+ image_embeddings=image_encoder_last_hidden_state,
1206
+ latent_indices=latent_indices,
1207
+ clean_latents=clean_latents,
1208
+ clean_latent_indices=clean_latent_indices,
1209
+ clean_latents_2x=clean_latents_2x,
1210
+ clean_latent_2x_indices=clean_latent_2x_indices,
1211
+ clean_latents_4x=clean_latents_4x,
1212
+ clean_latent_4x_indices=clean_latent_4x_indices,
1213
+ )
 
 
 
 
1214
 
1215
+ # concatenate generated latents
1216
+ total_generated_latent_frames += int(generated_latents.shape[2])
1217
+ if not f1_mode:
1218
+ # Inverted Anti-drifting: prepend generated latents to history latents
1219
+ if is_last_section:
1220
+ generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
1221
+ total_generated_latent_frames += 1
1222
 
1223
+ history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
1224
+ real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
1225
+ else:
1226
+ # F1 mode: append generated latents to history latents
1227
+ history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
1228
+ real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
1229
+
1230
+ logger.info(f"Generated. Latent shape {real_history_latents.shape}")
1231
+
1232
+ # # TODO support saving intermediate video
1233
+ # clean_memory_on_device(device)
1234
+ # vae.to(device)
1235
+ # if history_pixels is None:
1236
+ # history_pixels = hunyuan.vae_decode(real_history_latents, vae).cpu()
1237
+ # else:
1238
+ # section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
1239
+ # overlapped_frames = latent_window_size * 4 - 3
1240
+ # current_pixels = hunyuan.vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu()
1241
+ # history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
1242
+ # vae.to("cpu")
1243
+ # # if not is_last_section:
1244
+ # # # save intermediate video
1245
+ # # save_video(history_pixels[0], args, total_generated_latent_frames)
1246
+ # print(f"Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}")
1247
 
1248
  # Only clean up shared models if they were created within this function
1249
  if shared_models is None:
 
1254
  model.to("cpu")
1255
 
1256
  # wait for 5 seconds until block swap is done
1257
+ if args.blocks_to_swap > 0:
1258
+ logger.info("Waiting for 5 seconds to finish block swap")
1259
+ time.sleep(5)
1260
 
1261
  gc.collect()
1262
  clean_memory_on_device(device)
 
1264
  return vae, real_history_latents
1265
 
1266
 
1267
+ def generate_with_one_frame_inference(
1268
+ args: argparse.Namespace,
1269
+ model: HunyuanVideoTransformer3DModelPacked,
1270
+ context: Dict[int, Dict[str, torch.Tensor]],
1271
+ context_null: Dict[str, torch.Tensor],
1272
+ context_img: Dict[int, Dict[str, torch.Tensor]],
1273
+ control_latents: Optional[List[torch.Tensor]],
1274
+ control_mask_images: Optional[List[Optional[Image.Image]]],
1275
+ latent_window_size: int,
1276
+ height: int,
1277
+ width: int,
1278
+ device: torch.device,
1279
+ seed_g: torch.Generator,
1280
+ one_frame_inference: set[str],
1281
+ ) -> torch.Tensor:
1282
+ # one frame inference
1283
+ sample_num_frames = 1
1284
+ latent_indices = torch.zeros((1, 1), dtype=torch.int64) # 1x1 latent index for target image
1285
+ latent_indices[:, 0] = latent_window_size # last of latent_window
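+ # default index layout: control (clean 1x) latents at 0, the target frame at latent_window_size,
+ # an optional zero "post" latent at 1 + latent_window_size, then the 2x and 4x history slots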
1286
+
1287
+ def get_latent_mask(mask_image: Image.Image) -> torch.Tensor:
1288
+ if mask_image.mode != "L":
1289
+ mask_image = mask_image.convert("L")
1290
+ mask_image = mask_image.resize((width // 8, height // 8), Image.LANCZOS)
1291
+ mask_image = np.array(mask_image) # PIL to numpy, HWC
1292
+ mask_image = torch.from_numpy(mask_image).float() / 255.0 # 0 to 1.0, HWC
1293
+ mask_image = mask_image.squeeze(-1) # HWC -> HW
1294
+ mask_image = mask_image.unsqueeze(0).unsqueeze(0).unsqueeze(0) # HW -> 111HW (BCFHW)
1295
+ mask_image = mask_image.to(torch.float32)
1296
+ return mask_image
1297
+
1298
+ if control_latents is None or len(control_latents) == 0:
1299
+ logger.info(f"No control images provided for one frame inference. Use zero latents for control images.")
1300
+ control_latents = [torch.zeros(1, 16, 1, height // 8, width // 8, dtype=torch.float32)]
1301
+
1302
+ if "no_post" not in one_frame_inference:
1303
+ # add zero latents as clean latents post
1304
+ control_latents.append(torch.zeros((1, 16, 1, height // 8, width // 8), dtype=torch.float32))
1305
+ logger.info(f"Add zero latents as clean latents post for one frame inference.")
1306
+
1307
+ # kisekaeichi and 1f-mc: both are using control images, but indices are different
1308
+ clean_latents = torch.cat(control_latents, dim=2) # (1, 16, num_control_images, H//8, W//8)
1309
+ clean_latent_indices = torch.zeros((1, len(control_latents)), dtype=torch.int64)
1310
+ if "no_post" not in one_frame_inference:
1311
+ clean_latent_indices[:, -1] = 1 + latent_window_size # default index for clean latents post
1312
+
1313
+ for i in range(len(control_latents)):
1314
+ mask_image = None
1315
+ if args.control_image_mask_path is not None and i < len(args.control_image_mask_path):
1316
+ mask_image = get_latent_mask(Image.open(args.control_image_mask_path[i]))
1317
+ logger.info(
1318
+ f"Apply mask for clean latents 1x for {i + 1}: {args.control_image_mask_path[i]}, shape: {mask_image.shape}"
1319
+ )
1320
+ elif control_mask_images is not None and i < len(control_mask_images) and control_mask_images[i] is not None:
1321
+ mask_image = get_latent_mask(control_mask_images[i])
1322
+ logger.info(f"Apply mask for clean latents 1x for {i + 1} with alpha channel: {mask_image.shape}")
1323
+ if mask_image is not None:
1324
+ clean_latents[:, :, i : i + 1, :, :] = clean_latents[:, :, i : i + 1, :, :] * mask_image
1325
+
1326
+ for one_frame_param in one_frame_inference:
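+ # options are given as a comma-separated list, e.g. "target_index=1,control_index=0;10,no_2x,no_4x" (illustrative)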
1327
+ if one_frame_param.startswith("target_index="):
1328
+ target_index = int(one_frame_param.split("=")[1])
1329
+ latent_indices[:, 0] = target_index
1330
+ logger.info(f"Set index for target: {target_index}")
1331
+ elif one_frame_param.startswith("control_index="):
1332
+ control_indices = one_frame_param.split("=")[1].split(";")
1333
+ i = 0
1334
+ while i < len(control_indices) and i < clean_latent_indices.shape[1]:
1335
+ control_index = int(control_indices[i])
1336
+ clean_latent_indices[:, i] = control_index
1337
+ i += 1
1338
+ logger.info(f"Set index for clean latent 1x: {control_indices}")
1339
+
1340
+ # "default" option does nothing, so we can skip it
1341
+ if "default" in one_frame_inference:
1342
+ pass
1343
+
1344
+ if "no_2x" in one_frame_inference:
1345
+ clean_latents_2x = None
1346
+ clean_latent_2x_indices = None
1347
+ logger.info(f"No clean_latents_2x")
1348
+ else:
1349
+ clean_latents_2x = torch.zeros((1, 16, 2, height // 8, width // 8), dtype=torch.float32)
1350
+ index = 1 + latent_window_size + 1
1351
+ clean_latent_2x_indices = torch.arange(index, index + 2).unsqueeze(0) # 2
1352
+
1353
+ if "no_4x" in one_frame_inference:
1354
+ clean_latents_4x = None
1355
+ clean_latent_4x_indices = None
1356
+ logger.info(f"No clean_latents_4x")
1357
+ else:
1358
+ clean_latents_4x = torch.zeros((1, 16, 16, height // 8, width // 8), dtype=torch.float32)
1359
+ index = 1 + latent_window_size + 1 + 2
1360
+ clean_latent_4x_indices = torch.arange(index, index + 16).unsqueeze(0) # 16
1361
+
1362
+ logger.info(
1363
+ f"One frame inference. clean_latent: {clean_latents.shape} latent_indices: {latent_indices}, clean_latent_indices: {clean_latent_indices}, num_frames: {sample_num_frames}"
1364
+ )
1365
+
1366
+ # prepare conditioning inputs
1367
+ prompt_index = 0
1368
+ image_index = 0
1369
+
1370
+ context_for_index = context[prompt_index]
1371
+ logger.info(f"Prompt: {context_for_index['prompt']}")
1372
+
1373
+ llama_vec = context_for_index["llama_vec"].to(device, dtype=torch.bfloat16)
1374
+ llama_attention_mask = context_for_index["llama_attention_mask"].to(device)
1375
+ clip_l_pooler = context_for_index["clip_l_pooler"].to(device, dtype=torch.bfloat16)
1376
+
1377
+ image_encoder_last_hidden_state = context_img[image_index]["image_encoder_last_hidden_state"].to(device, dtype=torch.bfloat16)
1378
+
1379
+ llama_vec_n = context_null["llama_vec"].to(device, dtype=torch.bfloat16)
1380
+ llama_attention_mask_n = context_null["llama_attention_mask"].to(device)
1381
+ clip_l_pooler_n = context_null["clip_l_pooler"].to(device, dtype=torch.bfloat16)
1382
+
1383
+ generated_latents = sample_hunyuan(
1384
+ transformer=model,
1385
+ sampler=args.sample_solver,
1386
+ width=width,
1387
+ height=height,
1388
+ frames=1,
1389
+ real_guidance_scale=args.guidance_scale,
1390
+ distilled_guidance_scale=args.embedded_cfg_scale,
1391
+ guidance_rescale=args.guidance_rescale,
1392
+ # shift=3.0,
1393
+ num_inference_steps=args.infer_steps,
1394
+ generator=seed_g,
1395
+ prompt_embeds=llama_vec,
1396
+ prompt_embeds_mask=llama_attention_mask,
1397
+ prompt_poolers=clip_l_pooler,
1398
+ negative_prompt_embeds=llama_vec_n,
1399
+ negative_prompt_embeds_mask=llama_attention_mask_n,
1400
+ negative_prompt_poolers=clip_l_pooler_n,
1401
+ device=device,
1402
+ dtype=torch.bfloat16,
1403
+ image_embeddings=image_encoder_last_hidden_state,
1404
+ latent_indices=latent_indices,
1405
+ clean_latents=clean_latents,
1406
+ clean_latent_indices=clean_latent_indices,
1407
+ clean_latents_2x=clean_latents_2x,
1408
+ clean_latent_2x_indices=clean_latent_2x_indices,
1409
+ clean_latents_4x=clean_latents_4x,
1410
+ clean_latent_4x_indices=clean_latent_4x_indices,
1411
+ )
1412
+
1413
+ real_history_latents = generated_latents.to(clean_latents)
1414
+ return real_history_latents
1415
+
1416
+
1417
  def save_latent(latent: torch.Tensor, args: argparse.Namespace, height: int, width: int) -> str:
1418
  """Save latent to file
1419
 
fpack_train_network.py ADDED
@@ -0,0 +1,617 @@
1
+ import argparse
2
+ import gc
3
+ import math
4
+ import time
5
+ from typing import Optional
6
+ from PIL import Image
7
+
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torchvision.transforms.functional as TF
12
+ from tqdm import tqdm
13
+ from accelerate import Accelerator, init_empty_weights
14
+
15
+ from dataset import image_video_dataset
16
+ from dataset.image_video_dataset import ARCHITECTURE_FRAMEPACK, ARCHITECTURE_FRAMEPACK_FULL, load_video
17
+ from fpack_generate_video import decode_latent
18
+ from frame_pack import hunyuan
19
+ from frame_pack.clip_vision import hf_clip_vision_encode
20
+ from frame_pack.framepack_utils import load_image_encoders, load_text_encoder1, load_text_encoder2
21
+ from frame_pack.framepack_utils import load_vae as load_framepack_vae
22
+ from frame_pack.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked, load_packed_model
23
+ from frame_pack.k_diffusion_hunyuan import sample_hunyuan
24
+ from frame_pack.utils import crop_or_pad_yield_mask
25
+ from dataset.image_video_dataset import resize_image_to_bucket
26
+ from hv_train_network import NetworkTrainer, load_prompts, clean_memory_on_device, setup_parser_common, read_config_from_file
27
+
28
+ import logging
29
+
30
+ logger = logging.getLogger(__name__)
31
+ logging.basicConfig(level=logging.INFO)
32
+
33
+ from utils import model_utils
34
+ from utils.safetensors_utils import load_safetensors, MemoryEfficientSafeOpen
35
+
36
+
37
+ class FramePackNetworkTrainer(NetworkTrainer):
38
+ def __init__(self):
39
+ super().__init__()
40
+
41
+ # region model specific
42
+
43
+ @property
44
+ def architecture(self) -> str:
45
+ return ARCHITECTURE_FRAMEPACK
46
+
47
+ @property
48
+ def architecture_full_name(self) -> str:
49
+ return ARCHITECTURE_FRAMEPACK_FULL
50
+
51
+ def handle_model_specific_args(self, args):
52
+ self._i2v_training = True
53
+ self._control_training = False
54
+ self.default_guidance_scale = 10.0 # embedded guidance scale
55
+
56
+ def process_sample_prompts(
57
+ self,
58
+ args: argparse.Namespace,
59
+ accelerator: Accelerator,
60
+ sample_prompts: str,
61
+ ):
62
+ device = accelerator.device
63
+
64
+ logger.info(f"cache Text Encoder outputs for sample prompt: {sample_prompts}")
65
+ prompts = load_prompts(sample_prompts)
66
+
67
+ # load text encoder
68
+ tokenizer1, text_encoder1 = load_text_encoder1(args, args.fp8_llm, device)
69
+ tokenizer2, text_encoder2 = load_text_encoder2(args)
70
+ text_encoder2.to(device)
71
+
72
+ sample_prompts_te_outputs = {} # (prompt) -> (t1 embeds, t1 mask, t2 embeds)
73
+ for prompt_dict in prompts:
74
+ for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]:
75
+ if p is None or p in sample_prompts_te_outputs:
76
+ continue
77
+ logger.info(f"cache Text Encoder outputs for prompt: {p}")
78
+ with torch.amp.autocast(device_type=device.type, dtype=text_encoder1.dtype), torch.no_grad():
79
+ llama_vec, clip_l_pooler = hunyuan.encode_prompt_conds(p, text_encoder1, text_encoder2, tokenizer1, tokenizer2)
80
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
81
+
82
+ llama_vec = llama_vec.to("cpu")
83
+ llama_attention_mask = llama_attention_mask.to("cpu")
84
+ clip_l_pooler = clip_l_pooler.to("cpu")
85
+ sample_prompts_te_outputs[p] = (llama_vec, llama_attention_mask, clip_l_pooler)
86
+ del text_encoder1, text_encoder2
87
+ clean_memory_on_device(device)
88
+
89
+ # image embedding for I2V training
90
+ feature_extractor, image_encoder = load_image_encoders(args)
91
+ image_encoder.to(device)
92
+
93
+ # encode image with image encoder
94
+ sample_prompts_image_embs = {}
95
+ for prompt_dict in prompts:
96
+ image_path = prompt_dict.get("image_path", None)
97
+ assert image_path is not None, "image_path should be set for I2V training"
98
+ if image_path in sample_prompts_image_embs:
99
+ continue
100
+
101
+ logger.info(f"Encoding image to image encoder context: {image_path}")
102
+
103
+ height = prompt_dict.get("height", 256)
104
+ width = prompt_dict.get("width", 256)
105
+
106
+ img = Image.open(image_path).convert("RGB")
107
+ img_np = np.array(img) # PIL to numpy, HWC
108
+ img_np = image_video_dataset.resize_image_to_bucket(img_np, (width, height)) # returns a numpy array
109
+
110
+ with torch.no_grad():
111
+ image_encoder_output = hf_clip_vision_encode(img_np, feature_extractor, image_encoder)
112
+ image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
113
+
114
+ image_encoder_last_hidden_state = image_encoder_last_hidden_state.to("cpu")
115
+ sample_prompts_image_embs[image_path] = image_encoder_last_hidden_state
116
+
117
+ del image_encoder
118
+ clean_memory_on_device(device)
119
+
120
+ # prepare sample parameters
121
+ sample_parameters = []
122
+ for prompt_dict in prompts:
123
+ prompt_dict_copy = prompt_dict.copy()
124
+
125
+ p = prompt_dict.get("prompt", "")
126
+ llama_vec, llama_attention_mask, clip_l_pooler = sample_prompts_te_outputs[p]
127
+ prompt_dict_copy["llama_vec"] = llama_vec
128
+ prompt_dict_copy["llama_attention_mask"] = llama_attention_mask
129
+ prompt_dict_copy["clip_l_pooler"] = clip_l_pooler
130
+
131
+ p = prompt_dict.get("negative_prompt", "")
132
+ llama_vec, llama_attention_mask, clip_l_pooler = sample_prompts_te_outputs[p]
133
+ prompt_dict_copy["negative_llama_vec"] = llama_vec
134
+ prompt_dict_copy["negative_llama_attention_mask"] = llama_attention_mask
135
+ prompt_dict_copy["negative_clip_l_pooler"] = clip_l_pooler
136
+
137
+ p = prompt_dict.get("image_path", None)
138
+ prompt_dict_copy["image_encoder_last_hidden_state"] = sample_prompts_image_embs[p]
139
+
140
+ sample_parameters.append(prompt_dict_copy)
141
+
142
+ clean_memory_on_device(accelerator.device)
143
+ return sample_parameters
144
+
145
+ def do_inference(
146
+ self,
147
+ accelerator,
148
+ args,
149
+ sample_parameter,
150
+ vae,
151
+ dit_dtype,
152
+ transformer,
153
+ discrete_flow_shift,
154
+ sample_steps,
155
+ width,
156
+ height,
157
+ frame_count,
158
+ generator,
159
+ do_classifier_free_guidance,
160
+ guidance_scale,
161
+ cfg_scale,
162
+ image_path=None,
163
+ control_video_path=None,
164
+ ):
165
+ """architecture dependent inference"""
166
+ model: HunyuanVideoTransformer3DModelPacked = transformer
167
+ device = accelerator.device
168
+ if cfg_scale is None:
169
+ cfg_scale = 1.0
170
+ do_classifier_free_guidance = do_classifier_free_guidance and cfg_scale != 1.0
171
+
172
+ # prepare parameters
173
+ one_frame_mode = args.one_frame
174
+ if one_frame_mode:
175
+ one_frame_inference = set()
176
+ for mode in sample_parameter["one_frame"].split(","):
177
+ one_frame_inference.add(mode.strip())
178
+ else:
179
+ one_frame_inference = None
180
+
181
+ latent_window_size = args.latent_window_size # default is 9
182
+ latent_f = (frame_count - 1) // 4 + 1
183
+ total_latent_sections = math.floor((latent_f - 1) / latent_window_size)
184
+ if total_latent_sections < 1 and not one_frame_mode:
185
+ logger.warning(f"Not enough frames for FramePack: {latent_f}, minimum: {latent_window_size*4+1}")
186
+ return None
187
+
188
+ latent_f = total_latent_sections * latent_window_size + 1
189
+ actual_frame_count = (latent_f - 1) * 4 + 1
190
+ if actual_frame_count != frame_count:
191
+ logger.info(f"Frame count mismatch: {actual_frame_count} != {frame_count}, trimming to {actual_frame_count}")
192
+ frame_count = actual_frame_count
193
+ num_frames = latent_window_size * 4 - 3
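+ # the FramePack VAE compresses time by 4 (with one extra leading frame), so a section of
+ # latent_window_size latents corresponds to latent_window_size * 4 - 3 newly generated frames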
194
+
195
+ # prepare start and control latent
196
+ def encode_image(path):
197
+ image = Image.open(path)
198
+ if image.mode == "RGBA":
199
+ alpha = image.split()[-1]
200
+ image = image.convert("RGB")
201
+ else:
202
+ alpha = None
203
+ image = resize_image_to_bucket(image, (width, height)) # returns a numpy array
204
+ image = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(1).unsqueeze(0).float() # 1, C, 1, H, W
205
+ image = image / 127.5 - 1 # -1 to 1
206
+ return hunyuan.vae_encode(image, vae).to("cpu"), alpha
207
+
208
+ # VAE encoding
209
+ logger.info(f"Encoding image to latent space")
210
+ vae.to(device)
211
+
212
+ start_latent, _ = (
213
+ encode_image(image_path) if image_path else (torch.zeros((1, 16, 1, height // 8, width // 8), dtype=torch.float32), None)  # keep the (latent, alpha) tuple shape
214
+ )
215
+
216
+ if one_frame_mode:
217
+ control_latents = []
218
+ control_alphas = []
219
+ if "control_image_path" in sample_parameter:
220
+ for control_image_path in sample_parameter["control_image_path"]:
221
+ control_latent, control_alpha = encode_image(control_image_path)
222
+ control_latents.append(control_latent)
223
+ control_alphas.append(control_alpha)
224
+ else:
225
+ control_latents = None
226
+ control_alphas = None
227
+
228
+ vae.to("cpu") # move VAE to CPU to save memory
229
+ clean_memory_on_device(device)
230
+
231
+ # sampling
232
+ if not one_frame_mode:
233
+ f1_mode = args.f1
234
+ history_latents = torch.zeros((1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32)
235
+
236
+ if not f1_mode:
237
+ total_generated_latent_frames = 0
238
+ latent_paddings = reversed(range(total_latent_sections))
239
+ else:
240
+ total_generated_latent_frames = 1
241
+ history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
242
+ latent_paddings = [0] * total_latent_sections
243
+
244
+ if total_latent_sections > 4:
245
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
246
+
247
+ latent_paddings = list(latent_paddings)
248
+ for loop_index in range(total_latent_sections):
249
+ latent_padding = latent_paddings[loop_index]
250
+
251
+ if not f1_mode:
252
+ is_last_section = latent_padding == 0
253
+ latent_padding_size = latent_padding * latent_window_size
254
+
255
+ logger.info(f"latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}")
256
+
257
+ indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
258
+ (
259
+ clean_latent_indices_pre,
260
+ blank_indices,
261
+ latent_indices,
262
+ clean_latent_indices_post,
263
+ clean_latent_2x_indices,
264
+ clean_latent_4x_indices,
265
+ ) = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
266
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
267
+
268
+ clean_latents_pre = start_latent.to(history_latents)
269
+ clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, : 1 + 2 + 16, :, :].split(
270
+ [1, 2, 16], dim=2
271
+ )
272
+ clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
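+ # 1x context = [start-image latent, most recently generated latent] (history is stored newest-first here)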
273
+ else:
274
+ indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
275
+ (
276
+ clean_latent_indices_start,
277
+ clean_latent_4x_indices,
278
+ clean_latent_2x_indices,
279
+ clean_latent_1x_indices,
280
+ latent_indices,
281
+ ) = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
282
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
283
+
284
+ clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]) :, :, :].split(
285
+ [16, 2, 1], dim=2
286
+ )
287
+ clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
288
+
289
+ # if use_teacache:
290
+ # transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
291
+ # else:
292
+ # transformer.initialize_teacache(enable_teacache=False)
293
+
294
+ llama_vec = sample_parameter["llama_vec"].to(device, dtype=torch.bfloat16)
295
+ llama_attention_mask = sample_parameter["llama_attention_mask"].to(device)
296
+ clip_l_pooler = sample_parameter["clip_l_pooler"].to(device, dtype=torch.bfloat16)
297
+ if cfg_scale == 1.0:
298
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
299
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
300
+ else:
301
+ llama_vec_n = sample_parameter["negative_llama_vec"].to(device, dtype=torch.bfloat16)
302
+ llama_attention_mask_n = sample_parameter["negative_llama_attention_mask"].to(device)
303
+ clip_l_pooler_n = sample_parameter["negative_clip_l_pooler"].to(device, dtype=torch.bfloat16)
304
+ image_encoder_last_hidden_state = sample_parameter["image_encoder_last_hidden_state"].to(
305
+ device, dtype=torch.bfloat16
306
+ )
307
+
308
+ generated_latents = sample_hunyuan(
309
+ transformer=model,
310
+ sampler=args.sample_solver,
311
+ width=width,
312
+ height=height,
313
+ frames=num_frames,
314
+ real_guidance_scale=cfg_scale,
315
+ distilled_guidance_scale=guidance_scale,
316
+ guidance_rescale=0.0,
317
+ # shift=3.0,
318
+ num_inference_steps=sample_steps,
319
+ generator=generator,
320
+ prompt_embeds=llama_vec,
321
+ prompt_embeds_mask=llama_attention_mask,
322
+ prompt_poolers=clip_l_pooler,
323
+ negative_prompt_embeds=llama_vec_n,
324
+ negative_prompt_embeds_mask=llama_attention_mask_n,
325
+ negative_prompt_poolers=clip_l_pooler_n,
326
+ device=device,
327
+ dtype=torch.bfloat16,
328
+ image_embeddings=image_encoder_last_hidden_state,
329
+ latent_indices=latent_indices,
330
+ clean_latents=clean_latents,
331
+ clean_latent_indices=clean_latent_indices,
332
+ clean_latents_2x=clean_latents_2x,
333
+ clean_latent_2x_indices=clean_latent_2x_indices,
334
+ clean_latents_4x=clean_latents_4x,
335
+ clean_latent_4x_indices=clean_latent_4x_indices,
336
+ )
337
+
338
+ total_generated_latent_frames += int(generated_latents.shape[2])
339
+ if not f1_mode:
340
+ if is_last_section:
341
+ generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
342
+ total_generated_latent_frames += 1
343
+ history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
344
+ real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
345
+ else:
346
+ history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
347
+ real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
348
+
349
+ logger.info(f"Generated. Latent shape {real_history_latents.shape}")
350
+ else:
351
+ # one frame mode
352
+ sample_num_frames = 1
353
+ latent_indices = torch.zeros((1, 1), dtype=torch.int64) # 1x1 latent index for target image
354
+ latent_indices[:, 0] = latent_window_size # last of latent_window
355
+
356
+ def get_latent_mask(mask_image: Image.Image):
357
+ mask_image = mask_image.resize((width // 8, height // 8), Image.LANCZOS)
358
+ mask_image = np.array(mask_image) # PIL to numpy, HWC
359
+ mask_image = torch.from_numpy(mask_image).float() / 255.0 # 0 to 1.0, HWC
360
+ mask_image = mask_image.squeeze(-1) # HWC -> HW
361
+ mask_image = mask_image.unsqueeze(0).unsqueeze(0).unsqueeze(0) # HW -> 111HW (B, C, F, H, W)
362
+ mask_image = mask_image.to(torch.float32)
363
+ return mask_image
364
+
365
+ if control_latents is None or len(control_latents) == 0:
366
+ logger.info(f"No control images provided for one frame inference. Use zero latents for control images.")
367
+ control_latents = [torch.zeros(1, 16, 1, height // 8, width // 8, dtype=torch.float32)]
+ control_alphas = [None]  # keep alphas aligned with the zero latent so the mask loop below does not fail
368
+
369
+ if "no_post" not in one_frame_inference:
370
+ # add zero latents as clean latents post
371
+ control_latents.append(torch.zeros((1, 16, 1, height // 8, width // 8), dtype=torch.float32))
372
+ logger.info(f"Add zero latents as clean latents post for one frame inference.")
373
+
374
+ # kisekaeichi and 1f-mc: both are using control images, but indices are different
375
+ clean_latents = torch.cat(control_latents, dim=2) # (1, 16, num_control_images, H//8, W//8)
376
+ clean_latent_indices = torch.zeros((1, len(control_latents)), dtype=torch.int64)
377
+ if "no_post" not in one_frame_inference:
378
+ clean_latent_indices[:, -1] = 1 + latent_window_size # default index for clean latents post
379
+
380
+ # apply mask for control latents (clean latents)
381
+ for i in range(len(control_alphas)):
382
+ control_alpha = control_alphas[i]
383
+ if control_alpha is not None:
384
+ latent_mask = get_latent_mask(control_alpha)
385
+ logger.info(
386
+ f"Apply mask for clean latents 1x for {i+1}: shape: {latent_mask.shape}"
387
+ )
388
+ clean_latents[:, :, i : i + 1, :, :] = clean_latents[:, :, i : i + 1, :, :] * latent_mask
389
+
390
+ for one_frame_param in one_frame_inference:
391
+ if one_frame_param.startswith("target_index="):
392
+ target_index = int(one_frame_param.split("=")[1])
393
+ latent_indices[:, 0] = target_index
394
+ logger.info(f"Set index for target: {target_index}")
395
+ elif one_frame_param.startswith("control_index="):
396
+ control_indices = one_frame_param.split("=")[1].split(";")
397
+ i = 0
398
+ while i < len(control_indices) and i < clean_latent_indices.shape[1]:
399
+ control_index = int(control_indices[i])
400
+ clean_latent_indices[:, i] = control_index
401
+ i += 1
402
+ logger.info(f"Set index for clean latent 1x: {control_indices}")
403
+
404
+ if "no_2x" in one_frame_inference:
405
+ clean_latents_2x = None
406
+ clean_latent_2x_indices = None
407
+ logger.info(f"No clean_latents_2x")
408
+ else:
409
+ clean_latents_2x = torch.zeros((1, 16, 2, height // 8, width // 8), dtype=torch.float32)
410
+ index = 1 + latent_window_size + 1
411
+ clean_latent_2x_indices = torch.arange(index, index + 2).unsqueeze(0)  # (1, 2), as in generate_with_one_frame_inference
412
+
413
+ if "no_4x" in one_frame_inference:
414
+ clean_latents_4x = None
415
+ clean_latent_4x_indices = None
416
+ logger.info(f"No clean_latents_4x")
417
+ else:
418
+ clean_latents_4x = torch.zeros((1, 16, 16, height // 8, width // 8), dtype=torch.float32)
+ index = 1 + latent_window_size + 1 + 2
419
+ clean_latent_4x_indices = torch.arange(index, index + 16).unsqueeze(0)  # (1, 16)
420
+
421
+ logger.info(
422
+ f"One frame inference. clean_latent: {clean_latents.shape} latent_indices: {latent_indices}, clean_latent_indices: {clean_latent_indices}, num_frames: {sample_num_frames}"
423
+ )
424
+
425
+ # prepare conditioning inputs
426
+ llama_vec = sample_parameter["llama_vec"].to(device, dtype=torch.bfloat16)
427
+ llama_attention_mask = sample_parameter["llama_attention_mask"].to(device)
428
+ clip_l_pooler = sample_parameter["clip_l_pooler"].to(device, dtype=torch.bfloat16)
429
+ if cfg_scale == 1.0:
430
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
431
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
432
+ else:
433
+ llama_vec_n = sample_parameter["negative_llama_vec"].to(device, dtype=torch.bfloat16)
434
+ llama_attention_mask_n = sample_parameter["negative_llama_attention_mask"].to(device)
435
+ clip_l_pooler_n = sample_parameter["negative_clip_l_pooler"].to(device, dtype=torch.bfloat16)
436
+ image_encoder_last_hidden_state = sample_parameter["image_encoder_last_hidden_state"].to(
437
+ device, dtype=torch.bfloat16
438
+ )
439
+
440
+ generated_latents = sample_hunyuan(
441
+ transformer=model,
442
+ sampler=args.sample_solver,
443
+ width=width,
444
+ height=height,
445
+ frames=1,
446
+ real_guidance_scale=cfg_scale,
447
+ distilled_guidance_scale=guidance_scale,
448
+ guidance_rescale=0.0,
449
+ # shift=3.0,
450
+ num_inference_steps=sample_steps,
451
+ generator=generator,
452
+ prompt_embeds=llama_vec,
453
+ prompt_embeds_mask=llama_attention_mask,
454
+ prompt_poolers=clip_l_pooler,
455
+ negative_prompt_embeds=llama_vec_n,
456
+ negative_prompt_embeds_mask=llama_attention_mask_n,
457
+ negative_prompt_poolers=clip_l_pooler_n,
458
+ device=device,
459
+ dtype=torch.bfloat16,
460
+ image_embeddings=image_encoder_last_hidden_state,
461
+ latent_indices=latent_indices,
462
+ clean_latents=clean_latents,
463
+ clean_latent_indices=clean_latent_indices,
464
+ clean_latents_2x=clean_latents_2x,
465
+ clean_latent_2x_indices=clean_latent_2x_indices,
466
+ clean_latents_4x=clean_latents_4x,
467
+ clean_latent_4x_indices=clean_latent_4x_indices,
468
+ )
469
+
470
+ real_history_latents = generated_latents.to(clean_latents)
471
+
472
+ # wait for 5 seconds until block swap is done
473
+ logger.info("Waiting for 5 seconds to finish block swap")
474
+ time.sleep(5)
475
+
476
+ gc.collect()
477
+ clean_memory_on_device(device)
478
+
479
+ video = decode_latent(
480
+ latent_window_size, total_latent_sections, args.bulk_decode, vae, real_history_latents, device, one_frame_mode
481
+ )
482
+ video = video.to("cpu", dtype=torch.float32).unsqueeze(0) # add batch dimension
483
+ video = (video / 2 + 0.5).clamp(0, 1) # -1 to 1 -> 0 to 1
484
+ clean_memory_on_device(device)
485
+
486
+ return video
487
+
488
+ def load_vae(self, args: argparse.Namespace, vae_dtype: torch.dtype, vae_path: str):
489
+ vae_path = args.vae
490
+ logger.info(f"Loading VAE model from {vae_path}")
491
+ vae = load_framepack_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, "cpu")
492
+ return vae
493
+
494
+ def load_transformer(
495
+ self,
496
+ accelerator: Accelerator,
497
+ args: argparse.Namespace,
498
+ dit_path: str,
499
+ attn_mode: str,
500
+ split_attn: bool,
501
+ loading_device: str,
502
+ dit_weight_dtype: Optional[torch.dtype],
503
+ ):
504
+ logger.info(f"Loading DiT model from {dit_path}")
505
+ device = accelerator.device
506
+ model = load_packed_model(device, dit_path, attn_mode, loading_device, args.fp8_scaled, split_attn)
507
+ return model
508
+
509
+ def scale_shift_latents(self, latents):
510
+ # FramePack VAE includes scaling
511
+ return latents
512
+
513
+ def call_dit(
514
+ self,
515
+ args: argparse.Namespace,
516
+ accelerator: Accelerator,
517
+ transformer,
518
+ latents: torch.Tensor,
519
+ batch: dict[str, torch.Tensor],
520
+ noise: torch.Tensor,
521
+ noisy_model_input: torch.Tensor,
522
+ timesteps: torch.Tensor,
523
+ network_dtype: torch.dtype,
524
+ ):
525
+ model: HunyuanVideoTransformer3DModelPacked = transformer
526
+ device = accelerator.device
527
+ batch_size = latents.shape[0]
528
+
529
+ # maybe model.dtype is better than network_dtype...
530
+ distilled_guidance = torch.tensor([args.guidance_scale * 1000.0] * batch_size).to(device=device, dtype=network_dtype)
531
+ latents = latents.to(device=accelerator.device, dtype=network_dtype)
532
+ noisy_model_input = noisy_model_input.to(device=accelerator.device, dtype=network_dtype)
533
+ # for k, v in batch.items():
534
+ # if isinstance(v, torch.Tensor):
535
+ # print(f"{k}: {v.shape} {v.dtype} {v.device}")
536
+ with accelerator.autocast():
537
+ clean_latent_2x_indices = batch["clean_latent_2x_indices"] if "clean_latent_2x_indices" in batch else None
538
+ if clean_latent_2x_indices is not None:
539
+ clean_latent_2x = batch["latents_clean_2x"] if "latents_clean_2x" in batch else None
540
+ if clean_latent_2x is None:
541
+ clean_latent_2x = torch.zeros(
542
+ (batch_size, 16, 2, latents.shape[3], latents.shape[4]), dtype=latents.dtype, device=latents.device
543
+ )
544
+ else:
545
+ clean_latent_2x = None
546
+
547
+ clean_latent_4x_indices = batch["clean_latent_4x_indices"] if "clean_latent_4x_indices" in batch else None
548
+ if clean_latent_4x_indices is not None:
549
+ clean_latent_4x = batch["latents_clean_4x"] if "latents_clean_4x" in batch else None
550
+ if clean_latent_4x is None:
551
+ clean_latent_4x = torch.zeros(
552
+ (batch_size, 16, 16, latents.shape[3], latents.shape[4]), dtype=latents.dtype, device=latents.device
553
+ )
554
+ else:
555
+ clean_latent_4x = None
556
+
557
+ model_pred = model(
558
+ hidden_states=noisy_model_input,
559
+ timestep=timesteps,
560
+ encoder_hidden_states=batch["llama_vec"],
561
+ encoder_attention_mask=batch["llama_attention_mask"],
562
+ pooled_projections=batch["clip_l_pooler"],
563
+ guidance=distilled_guidance,
564
+ latent_indices=batch["latent_indices"],
565
+ clean_latents=batch["latents_clean"],
566
+ clean_latent_indices=batch["clean_latent_indices"],
567
+ clean_latents_2x=clean_latent_2x,
568
+ clean_latent_2x_indices=clean_latent_2x_indices,
569
+ clean_latents_4x=clean_latent_4x,
570
+ clean_latent_4x_indices=clean_latent_4x_indices,
571
+ image_embeddings=batch["image_embeddings"],
572
+ return_dict=False,
573
+ )
574
+ model_pred = model_pred[0] # returns tuple (model_pred, )
575
+
576
+ # flow matching loss
577
+ target = noise - latents
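+ # flow-matching objective: the model predicts the velocity from data to noise,
+ # so the regression target is noise - latents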
578
+
579
+ return model_pred, target
580
+
581
+ # endregion model specific
582
+
583
+
584
+ def framepack_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
585
+ """FramePack specific parser setup"""
586
+ parser.add_argument("--fp8_scaled", action="store_true", help="use scaled fp8 for DiT / DiTにスケーリングされたfp8を使う")
587
+ parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for LLM / LLMにfp8を使う")
588
+ parser.add_argument("--text_encoder1", type=str, help="Text Encoder 1 directory / テキストエンコーダ1のディレクトリ")
589
+ parser.add_argument("--text_encoder2", type=str, help="Text Encoder 2 directory / テキストエンコーダ2のディレクトリ")
590
+ parser.add_argument("--vae_chunk_size", type=int, default=None, help="chunk size for CausalConv3d in VAE")
591
+ parser.add_argument(
592
+ "--vae_spatial_tile_sample_min_size", type=int, default=None, help="spatial tile sample min size for VAE, default 256"
593
+ )
594
+ parser.add_argument("--image_encoder", type=str, required=True, help="Image encoder (CLIP) checkpoint path or directory")
595
+ parser.add_argument("--latent_window_size", type=int, default=9, help="FramePack latent window size (default 9)")
596
+ parser.add_argument("--bulk_decode", action="store_true", help="decode all frames at once in sample generation")
597
+ parser.add_argument("--f1", action="store_true", help="Use F1 sampling method for sample generation")
598
+ parser.add_argument("--one_frame", action="store_true", help="Use one frame sampling method for sample generation")
599
+ return parser
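+ # illustrative invocation (FramePack-specific flags only; the dataset, network and optimizer
+ # options defined in setup_parser_common are omitted here):
+ #   accelerate launch fpack_train_network.py --image_encoder <siglip_dir> \
+ #     --text_encoder1 <te1_path> --text_encoder2 <te2_path> --latent_window_size 9 --fp8_scaled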
600
+
601
+
602
+ if __name__ == "__main__":
603
+ parser = setup_parser_common()
604
+ parser = framepack_setup_parser(parser)
605
+
606
+ args = parser.parse_args()
607
+ args = read_config_from_file(args, parser)
608
+
609
+ assert (
610
+ args.vae_dtype is None or args.vae_dtype == "float16"
611
+ ), "VAE dtype must be float16 / VAEのdtypeはfloat16でなければなりません"
612
+ args.vae_dtype = "float16" # fixed
613
+ args.dit_dtype = "bfloat16" # fixed
614
+ args.sample_solver = "unipc" # for sample generation, fixed to unipc
615
+
616
+ trainer = FramePackNetworkTrainer()
617
+ trainer.train(args)
hv_train.py ADDED
@@ -0,0 +1,1721 @@
1
+ import ast
2
+ import asyncio
3
+ from datetime import timedelta
4
+ import gc
5
+ import importlib
6
+ import argparse
7
+ import math
8
+ import os
9
+ import pathlib
10
+ import re
11
+ import sys
12
+ import random
13
+ import time
14
+ import json
15
+ from multiprocessing import Value
16
+ from typing import Any, Dict, List, Optional
17
+ import accelerate
18
+ import numpy as np
19
+ from packaging.version import Version
20
+
21
+ import huggingface_hub
22
+ import toml
23
+
24
+ import torch
25
+ from tqdm import tqdm
26
+ from accelerate.utils import set_seed
27
+ from accelerate import Accelerator, InitProcessGroupKwargs, DistributedDataParallelKwargs
28
+ from safetensors.torch import load_file, save_file
29
+ import transformers
30
+ from diffusers.optimization import (
31
+ SchedulerType as DiffusersSchedulerType,
32
+ TYPE_TO_SCHEDULER_FUNCTION as DIFFUSERS_TYPE_TO_SCHEDULER_FUNCTION,
33
+ )
34
+ from transformers.optimization import SchedulerType, TYPE_TO_SCHEDULER_FUNCTION
35
+
36
+ from dataset import config_utils
37
+ from hunyuan_model.models import load_transformer, get_rotary_pos_embed_by_shape
38
+ import hunyuan_model.text_encoder as text_encoder_module
39
+ from hunyuan_model.vae import load_vae
40
+ import hunyuan_model.vae as vae_module
41
+ from modules.scheduling_flow_match_discrete import FlowMatchDiscreteScheduler
42
+ import networks.lora as lora_module
43
+ from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
44
+ from dataset.image_video_dataset import ARCHITECTURE_HUNYUAN_VIDEO
45
+
46
+ import logging
47
+
48
+ from utils import huggingface_utils, model_utils, train_utils, sai_model_spec
49
+
50
+ logger = logging.getLogger(__name__)
51
+ logging.basicConfig(level=logging.INFO)
52
+
53
+
54
+ BASE_MODEL_VERSION_HUNYUAN_VIDEO = "hunyuan_video"
55
+
56
+ # TODO make separate file for some functions to commonize with other scripts
57
+
58
+
59
+ def clean_memory_on_device(device: torch.device):
60
+ r"""
61
+ Clean memory on the specified device, will be called from training scripts.
62
+ """
63
+ gc.collect()
64
+
65
+ # device may "cuda" or "cuda:0", so we need to check the type of device
66
+ if device.type == "cuda":
67
+ torch.cuda.empty_cache()
68
+ if device.type == "xpu":
69
+ torch.xpu.empty_cache()
70
+ if device.type == "mps":
71
+ torch.mps.empty_cache()
72
+
73
+
74
+ # for collate_fn: epoch and step is multiprocessing.Value
75
+ class collator_class:
76
+ def __init__(self, epoch, step, dataset):
77
+ self.current_epoch = epoch
78
+ self.current_step = step
79
+ self.dataset = dataset # not used if worker_info is not None, in case of multiprocessing
80
+
81
+ def __call__(self, examples):
82
+ worker_info = torch.utils.data.get_worker_info()
83
+ # worker_info is None in the main process
84
+ if worker_info is not None:
85
+ dataset = worker_info.dataset
86
+ else:
87
+ dataset = self.dataset
88
+
89
+ # set epoch and step
90
+ dataset.set_current_epoch(self.current_epoch.value)
91
+ dataset.set_current_step(self.current_step.value)
92
+ return examples[0]
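+ # the dataset is expected to yield pre-batched examples, so the collator simply unwraps the single element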
93
+
94
+
95
+ def prepare_accelerator(args: argparse.Namespace) -> Accelerator:
96
+ """
97
+ DeepSpeed is not supported in this script currently.
98
+ """
99
+ if args.logging_dir is None:
100
+ logging_dir = None
101
+ else:
102
+ log_prefix = "" if args.log_prefix is None else args.log_prefix
103
+ logging_dir = args.logging_dir + "/" + log_prefix + time.strftime("%Y%m%d%H%M%S", time.localtime())
104
+
105
+ if args.log_with is None:
106
+ if logging_dir is not None:
107
+ log_with = "tensorboard"
108
+ else:
109
+ log_with = None
110
+ else:
111
+ log_with = args.log_with
112
+ if log_with in ["tensorboard", "all"]:
113
+ if logging_dir is None:
114
+ raise ValueError(
115
+ "logging_dir is required when log_with is tensorboard / Tensorboardを使う場合、logging_dirを指定してください"
116
+ )
117
+ if log_with in ["wandb", "all"]:
118
+ try:
119
+ import wandb
120
+ except ImportError:
121
+ raise ImportError("No wandb / wandb がインストールされていないようです")
122
+ if logging_dir is not None:
123
+ os.makedirs(logging_dir, exist_ok=True)
124
+ os.environ["WANDB_DIR"] = logging_dir
125
+ if args.wandb_api_key is not None:
126
+ wandb.login(key=args.wandb_api_key)
127
+
128
+ kwargs_handlers = [
129
+ (
130
+ InitProcessGroupKwargs(
131
+ backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
132
+ init_method=(
133
+ "env://?use_libuv=False" if os.name == "nt" and Version(torch.__version__) >= Version("2.4.0") else None
134
+ ),
135
+ timeout=timedelta(minutes=args.ddp_timeout) if args.ddp_timeout else None,
136
+ )
137
+ if torch.cuda.device_count() > 1
138
+ else None
139
+ ),
140
+ (
141
+ DistributedDataParallelKwargs(
142
+ gradient_as_bucket_view=args.ddp_gradient_as_bucket_view, static_graph=args.ddp_static_graph
143
+ )
144
+ if args.ddp_gradient_as_bucket_view or args.ddp_static_graph
145
+ else None
146
+ ),
147
+ ]
148
+ kwargs_handlers = [i for i in kwargs_handlers if i is not None]
149
+
150
+ accelerator = Accelerator(
151
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
152
+ mixed_precision=args.mixed_precision,
153
+ log_with=log_with,
154
+ project_dir=logging_dir,
155
+ kwargs_handlers=kwargs_handlers,
156
+ )
157
+ print("accelerator device:", accelerator.device)
158
+ return accelerator
159
+
160
+
161
+ def line_to_prompt_dict(line: str) -> dict:
162
+ # subset of gen_img_diffusers
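+ # example line: "a cat walking on the grass --w 512 --h 320 --f 25 --d 42 --s 20"
+ # (--w width, --h height, --f frame count, --d seed, --s steps)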
163
+ prompt_args = line.split(" --")
164
+ prompt_dict = {}
165
+ prompt_dict["prompt"] = prompt_args[0]
166
+
167
+ for parg in prompt_args:
168
+ try:
169
+ m = re.match(r"w (\d+)", parg, re.IGNORECASE)
170
+ if m:
171
+ prompt_dict["width"] = int(m.group(1))
172
+ continue
173
+
174
+ m = re.match(r"h (\d+)", parg, re.IGNORECASE)
175
+ if m:
176
+ prompt_dict["height"] = int(m.group(1))
177
+ continue
178
+
179
+ m = re.match(r"f (\d+)", parg, re.IGNORECASE)
180
+ if m:
181
+ prompt_dict["frame_count"] = int(m.group(1))
182
+ continue
183
+
184
+ m = re.match(r"d (\d+)", parg, re.IGNORECASE)
185
+ if m:
186
+ prompt_dict["seed"] = int(m.group(1))
187
+ continue
188
+
189
+ m = re.match(r"s (\d+)", parg, re.IGNORECASE)
190
+ if m: # steps
191
+ prompt_dict["sample_steps"] = max(1, min(1000, int(m.group(1))))
192
+ continue
193
+
194
+ # m = re.match(r"l ([\d\.]+)", parg, re.IGNORECASE)
195
+ # if m: # scale
196
+ # prompt_dict["scale"] = float(m.group(1))
197
+ # continue
198
+ # m = re.match(r"n (.+)", parg, re.IGNORECASE)
199
+ # if m: # negative prompt
200
+ # prompt_dict["negative_prompt"] = m.group(1)
201
+ # continue
202
+
203
+ except ValueError as ex:
204
+ logger.error(f"Exception in parsing / 解析エラー: {parg}")
205
+ logger.error(ex)
206
+
207
+ return prompt_dict
208
+
209
+
210
+ def load_prompts(prompt_file: str) -> list[Dict]:
211
+ # read prompts
212
+ if prompt_file.endswith(".txt"):
213
+ with open(prompt_file, "r", encoding="utf-8") as f:
214
+ lines = f.readlines()
215
+ prompts = [line.strip() for line in lines if len(line.strip()) > 0 and line[0] != "#"]
216
+ elif prompt_file.endswith(".toml"):
217
+ with open(prompt_file, "r", encoding="utf-8") as f:
218
+ data = toml.load(f)
219
+ prompts = [dict(**data["prompt"], **subset) for subset in data["prompt"]["subset"]]
220
+ elif prompt_file.endswith(".json"):
221
+ with open(prompt_file, "r", encoding="utf-8") as f:
222
+ prompts = json.load(f)
223
+
224
+ # preprocess prompts
225
+ for i in range(len(prompts)):
226
+ prompt_dict = prompts[i]
227
+ if isinstance(prompt_dict, str):
228
+ prompt_dict = line_to_prompt_dict(prompt_dict)
229
+ prompts[i] = prompt_dict
230
+ assert isinstance(prompt_dict, dict)
231
+
232
+ # Adds an enumerator to the dict based on prompt position. Used later to name image files. Also cleanup of extra data in original prompt dict.
233
+ prompt_dict["enum"] = i
234
+ prompt_dict.pop("subset", None)
235
+
236
+ return prompts
237
+
238
+
239
+ def compute_density_for_timestep_sampling(
240
+ weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None
241
+ ):
242
+ """Compute the density for sampling the timesteps when doing SD3 training.
243
+
244
+ Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
245
+
246
+ SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
247
+ """
248
+ if weighting_scheme == "logit_normal":
249
+ # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$).
250
+ u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu")
251
+ u = torch.nn.functional.sigmoid(u)
252
+ elif weighting_scheme == "mode":
253
+ u = torch.rand(size=(batch_size,), device="cpu")
254
+ u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u)
255
+ else:
256
+ u = torch.rand(size=(batch_size,), device="cpu")
257
+ return u
258
+
259
+
260
+ def get_sigmas(noise_scheduler, timesteps, device, n_dim=4, dtype=torch.float32):
261
+ sigmas = noise_scheduler.sigmas.to(device=device, dtype=dtype)
262
+ schedule_timesteps = noise_scheduler.timesteps.to(device)
263
+ timesteps = timesteps.to(device)
264
+
265
+ # if sum([(schedule_timesteps == t) for t in timesteps]) < len(timesteps):
266
+ if any([(schedule_timesteps == t).sum() == 0 for t in timesteps]):
267
+ # raise ValueError("Some timesteps are not in the schedule / 一部のtimestepsがスケジュールに含まれていません")
268
+ # round to nearest timestep
269
+ logger.warning("Some timesteps are not in the schedule / 一部のtimestepsがスケジュールに含まれていません")
270
+ step_indices = [torch.argmin(torch.abs(schedule_timesteps - t)).item() for t in timesteps]
271
+ else:
272
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
273
+
274
+ sigma = sigmas[step_indices].flatten()
275
+ while len(sigma.shape) < n_dim:
276
+ sigma = sigma.unsqueeze(-1)
277
+ return sigma
278
+
279
+
280
+ def compute_loss_weighting_for_sd3(weighting_scheme: str, noise_scheduler, timesteps, device, dtype):
281
+ """Computes loss weighting scheme for SD3 training.
282
+
283
+ Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
284
+
285
+ SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
286
+ """
287
+ if weighting_scheme == "sigma_sqrt" or weighting_scheme == "cosmap":
288
+ sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=5, dtype=dtype)
289
+ if weighting_scheme == "sigma_sqrt":
290
+ weighting = (sigmas**-2.0).float()
291
+ else:
292
+ bot = 1 - 2 * sigmas + 2 * sigmas**2
293
+ weighting = 2 / (math.pi * bot)
294
+ else:
295
+ weighting = None # torch.ones_like(sigmas)
296
+ return weighting
297
+
298
+
299
+ class FineTuningTrainer:
300
+ def __init__(self):
301
+ pass
302
+
303
+ def process_sample_prompts(
304
+ self,
305
+ args: argparse.Namespace,
306
+ accelerator: Accelerator,
307
+ sample_prompts: str,
308
+ text_encoder1: str,
309
+ text_encoder2: str,
310
+ fp8_llm: bool,
311
+ ):
312
+ logger.info(f"cache Text Encoder outputs for sample prompt: {sample_prompts}")
313
+ prompts = load_prompts(sample_prompts)
314
+
315
+ def encode_for_text_encoder(text_encoder, is_llm=True):
316
+ sample_prompts_te_outputs = {} # (prompt) -> (embeds, mask)
317
+ with accelerator.autocast(), torch.no_grad():
318
+ for prompt_dict in prompts:
319
+ for p in [prompt_dict.get("prompt", "")]:
320
+ if p not in sample_prompts_te_outputs:
321
+ logger.info(f"cache Text Encoder outputs for prompt: {p}")
322
+
323
+ data_type = "video"
324
+ text_inputs = text_encoder.text2tokens(p, data_type=data_type)
325
+
326
+ prompt_outputs = text_encoder.encode(text_inputs, data_type=data_type)
327
+ sample_prompts_te_outputs[p] = (prompt_outputs.hidden_state, prompt_outputs.attention_mask)
328
+
329
+ return sample_prompts_te_outputs
330
+
331
+ # Load Text Encoder 1 and encode
332
+ text_encoder_dtype = torch.float16 if args.text_encoder_dtype is None else model_utils.str_to_dtype(args.text_encoder_dtype)
333
+ logger.info(f"loading text encoder 1: {text_encoder1}")
334
+ text_encoder_1 = text_encoder_module.load_text_encoder_1(text_encoder1, accelerator.device, fp8_llm, text_encoder_dtype)
335
+
336
+ logger.info("encoding with Text Encoder 1")
337
+ te_outputs_1 = encode_for_text_encoder(text_encoder_1)
338
+ del text_encoder_1
339
+
340
+ # Load Text Encoder 2 and encode
341
+ logger.info(f"loading text encoder 2: {text_encoder2}")
342
+ text_encoder_2 = text_encoder_module.load_text_encoder_2(text_encoder2, accelerator.device, text_encoder_dtype)
343
+
344
+ logger.info("encoding with Text Encoder 2")
345
+ te_outputs_2 = encode_for_text_encoder(text_encoder_2, is_llm=False)
346
+ del text_encoder_2
347
+
348
+ # prepare sample parameters
349
+ sample_parameters = []
350
+ for prompt_dict in prompts:
351
+ prompt_dict_copy = prompt_dict.copy()
352
+ p = prompt_dict.get("prompt", "")
353
+ prompt_dict_copy["llm_embeds"] = te_outputs_1[p][0]
354
+ prompt_dict_copy["llm_mask"] = te_outputs_1[p][1]
355
+ prompt_dict_copy["clipL_embeds"] = te_outputs_2[p][0]
356
+ prompt_dict_copy["clipL_mask"] = te_outputs_2[p][1]
357
+ sample_parameters.append(prompt_dict_copy)
358
+
359
+ clean_memory_on_device(accelerator.device)
360
+
361
+ return sample_parameters
362
+
363
+ def get_optimizer(self, args, trainable_params: list[torch.nn.Parameter]) -> tuple[str, str, torch.optim.Optimizer]:
364
+ # adamw, adamw8bit, adafactor
365
+
366
+ optimizer_type = args.optimizer_type.lower()
367
+
368
+ # split optimizer_type and optimizer_args
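+ # e.g. optimizer_args ["weight_decay=0.01", "betas=(0.9,0.99)"] is parsed into
+ # {"weight_decay": 0.01, "betas": (0.9, 0.99)} via ast.literal_eval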
369
+ optimizer_kwargs = {}
370
+ if args.optimizer_args is not None and len(args.optimizer_args) > 0:
371
+ for arg in args.optimizer_args:
372
+ key, value = arg.split("=")
373
+ value = ast.literal_eval(value)
374
+ optimizer_kwargs[key] = value
375
+
376
+ lr = args.learning_rate
377
+ optimizer = None
378
+ optimizer_class = None
379
+
380
+ if optimizer_type.endswith("8bit".lower()):
381
+ try:
382
+ import bitsandbytes as bnb
383
+ except ImportError:
384
+ raise ImportError("No bitsandbytes / bitsandbytesがインストールされていないようです")
385
+
386
+ if optimizer_type == "AdamW8bit".lower():
387
+ logger.info(f"use 8-bit AdamW optimizer | {optimizer_kwargs}")
388
+ optimizer_class = bnb.optim.AdamW8bit
389
+ optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
390
+
391
+ elif optimizer_type == "Adafactor".lower():
392
+ # Adafactor: check relative_step and warmup_init
393
+ if "relative_step" not in optimizer_kwargs:
394
+ optimizer_kwargs["relative_step"] = True # default
395
+ if not optimizer_kwargs["relative_step"] and optimizer_kwargs.get("warmup_init", False):
396
+ logger.info(
397
+ f"set relative_step to True because warmup_init is True / warmup_initがTrueのためrelative_stepをTrueにします"
398
+ )
399
+ optimizer_kwargs["relative_step"] = True
400
+ logger.info(f"use Adafactor optimizer | {optimizer_kwargs}")
401
+
402
+ if optimizer_kwargs["relative_step"]:
403
+ logger.info(f"relative_step is true / relative_stepがtrueです")
404
+ if lr != 0.0:
405
+ logger.warning(f"learning rate is used as initial_lr / 指定したlearning rateはinitial_lrとして使用されます")
406
+ args.learning_rate = None
407
+
408
+ if args.lr_scheduler != "adafactor":
409
+ logger.info(f"use adafactor_scheduler / スケジューラにadafactor_schedulerを使用します")
410
+ args.lr_scheduler = f"adafactor:{lr}"  # a bit awkward, but it works
411
+
412
+ lr = None
413
+ else:
414
+ if args.max_grad_norm != 0.0:
415
+ logger.warning(
416
+ f"because max_grad_norm is set, clip_grad_norm is enabled. consider set to 0 / max_grad_normが設定されているためclip_grad_normが有効になります。0に設定して無効にしたほうがいいかもしれません"
417
+ )
418
+ if args.lr_scheduler != "constant_with_warmup":
419
+ logger.warning(f"constant_with_warmup will be good / スケジューラはconstant_with_warmupが良いかもしれません")
420
+ if optimizer_kwargs.get("clip_threshold", 1.0) != 1.0:
421
+ logger.warning(f"clip_threshold=1.0 will be good / clip_thresholdは1.0が良いかもしれません")
422
+
423
+ optimizer_class = transformers.optimization.Adafactor
424
+ optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
425
+
426
+ elif optimizer_type == "AdamW".lower():
427
+ logger.info(f"use AdamW optimizer | {optimizer_kwargs}")
428
+ optimizer_class = torch.optim.AdamW
429
+ optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
430
+
431
+ if optimizer is None:
432
+ # use an arbitrary optimizer
433
+ case_sensitive_optimizer_type = args.optimizer_type # not lower
434
+ logger.info(f"use {case_sensitive_optimizer_type} | {optimizer_kwargs}")
435
+
436
+ if "." not in case_sensitive_optimizer_type: # from torch.optim
437
+ optimizer_module = torch.optim
438
+ else: # from other library
439
+ values = case_sensitive_optimizer_type.split(".")
440
+ optimizer_module = importlib.import_module(".".join(values[:-1]))
441
+ case_sensitive_optimizer_type = values[-1]
442
+
443
+ optimizer_class = getattr(optimizer_module, case_sensitive_optimizer_type)
444
+ optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
445
+
446
+ # for logging
447
+ optimizer_name = optimizer_class.__module__ + "." + optimizer_class.__name__
448
+ optimizer_args = ",".join([f"{k}={v}" for k, v in optimizer_kwargs.items()])
449
+
450
+ # get train and eval functions
451
+ if hasattr(optimizer, "train") and callable(optimizer.train):
452
+ train_fn = optimizer.train
453
+ eval_fn = optimizer.eval
454
+ else:
455
+ train_fn = lambda: None
456
+ eval_fn = lambda: None
457
+
458
+ return optimizer_name, optimizer_args, optimizer, train_fn, eval_fn
459
+
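# Hedged sketch of the two Adafactor configurations handled above; the kwargs are standard
# transformers.optimization.Adafactor arguments and the lr value 2e-6 is illustrative.
import torch
from transformers.optimization import Adafactor, AdafactorSchedule

params = [torch.nn.Parameter(torch.zeros(4))]

# relative_step=True: Adafactor manages its own lr, so it is paired with AdafactorSchedule
# (this is what the "adafactor:{lr}" scheduler string above ends up doing).
opt_rel = Adafactor(params, lr=None, relative_step=True, warmup_init=True, scale_parameter=True)
sched_rel = AdafactorSchedule(opt_rel, initial_lr=2e-6)

# relative_step=False: external lr, the mode the warnings above recommend combining with
# constant_with_warmup and clip_threshold=1.0.
opt_ext = Adafactor(params, lr=2e-6, relative_step=False, warmup_init=False,
                    scale_parameter=False, clip_threshold=1.0)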
460
+ def is_schedulefree_optimizer(self, optimizer: torch.optim.Optimizer, args: argparse.Namespace) -> bool:
461
+ return args.optimizer_type.lower().endswith("schedulefree".lower()) # or args.optimizer_schedulefree_wrapper
462
+
463
+ def get_dummy_scheduler(self, optimizer: torch.optim.Optimizer) -> Any:
464
+ # dummy scheduler for schedulefree optimizers. supports only an empty step(), get_last_lr() and the wrapped optimizer.
465
+ # this scheduler is used for logging only.
466
+ # it is not wrapped by accelerator because DummyScheduler is not a subclass of torch.optim.lr_scheduler._LRScheduler
467
+ class DummyScheduler:
468
+ def __init__(self, optimizer: torch.optim.Optimizer):
469
+ self.optimizer = optimizer
470
+
471
+ def step(self):
472
+ pass
473
+
474
+ def get_last_lr(self):
475
+ return [group["lr"] for group in self.optimizer.param_groups]
476
+
477
+ return DummyScheduler(optimizer)
478
+
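# Hedged sketch of how the train/eval hooks returned by get_optimizer are meant to be used.
# Schedule-free optimizers expose train()/eval() and carry their own schedule, which is why a
# DummyScheduler is enough for them; torch.optim.AdamW below is just a stand-in.
import torch

optimizer = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=1e-4)

if hasattr(optimizer, "train") and callable(optimizer.train):
    train_fn, eval_fn = optimizer.train, optimizer.eval
else:
    train_fn = eval_fn = lambda: None

eval_fn()   # switch the optimizer to eval mode before sampling images or saving a checkpoint
# ... sample / save ...
train_fn()  # and back to train mode before the next optimization step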
479
+ def get_scheduler(self, args, optimizer: torch.optim.Optimizer, num_processes: int):
480
+ """
481
+ Unified API to get any scheduler from its name.
482
+ """
483
+ # if schedulefree optimizer, return dummy scheduler
484
+ if self.is_schedulefree_optimizer(optimizer, args):
485
+ return self.get_dummy_scheduler(optimizer)
486
+
487
+ name = args.lr_scheduler
488
+ num_training_steps = args.max_train_steps * num_processes # * args.gradient_accumulation_steps
489
+ num_warmup_steps: Optional[int] = (
490
+ int(args.lr_warmup_steps * num_training_steps) if isinstance(args.lr_warmup_steps, float) else args.lr_warmup_steps
491
+ )
492
+ num_decay_steps: Optional[int] = (
493
+ int(args.lr_decay_steps * num_training_steps) if isinstance(args.lr_decay_steps, float) else args.lr_decay_steps
494
+ )
495
+ num_stable_steps = num_training_steps - num_warmup_steps - num_decay_steps
496
+ num_cycles = args.lr_scheduler_num_cycles
497
+ power = args.lr_scheduler_power
498
+ timescale = args.lr_scheduler_timescale
499
+ min_lr_ratio = args.lr_scheduler_min_lr_ratio
500
+
501
+ lr_scheduler_kwargs = {} # get custom lr_scheduler kwargs
502
+ if args.lr_scheduler_args is not None and len(args.lr_scheduler_args) > 0:
503
+ for arg in args.lr_scheduler_args:
504
+ key, value = arg.split("=")
505
+ value = ast.literal_eval(value)
506
+ lr_scheduler_kwargs[key] = value
507
+
508
+ def wrap_check_needless_num_warmup_steps(return_vals):
509
+ if num_warmup_steps is not None and num_warmup_steps != 0:
510
+ raise ValueError(f"{name} does not require `num_warmup_steps`. Set None or 0.")
511
+ return return_vals
512
+
513
+ # using any lr_scheduler from other library
514
+ if args.lr_scheduler_type:
515
+ lr_scheduler_type = args.lr_scheduler_type
516
+ logger.info(f"use {lr_scheduler_type} | {lr_scheduler_kwargs} as lr_scheduler")
517
+ if "." not in lr_scheduler_type: # default to use torch.optim
518
+ lr_scheduler_module = torch.optim.lr_scheduler
519
+ else:
520
+ values = lr_scheduler_type.split(".")
521
+ lr_scheduler_module = importlib.import_module(".".join(values[:-1]))
522
+ lr_scheduler_type = values[-1]
523
+ lr_scheduler_class = getattr(lr_scheduler_module, lr_scheduler_type)
524
+ lr_scheduler = lr_scheduler_class(optimizer, **lr_scheduler_kwargs)
525
+ return lr_scheduler
526
+
527
+ if name.startswith("adafactor"):
528
+ assert (
529
+ type(optimizer) == transformers.optimization.Adafactor
530
+ ), f"adafactor scheduler must be used with Adafactor optimizer / adafactor schedulerはAdafactorオプティマイザと同時に使ってください"
531
+ initial_lr = float(name.split(":")[1])
532
+ # logger.info(f"adafactor scheduler init lr {initial_lr}")
533
+ return wrap_check_needless_num_warmup_steps(transformers.optimization.AdafactorSchedule(optimizer, initial_lr))
534
+
535
+ if name == DiffusersSchedulerType.PIECEWISE_CONSTANT.value:
536
+ name = DiffusersSchedulerType(name)
537
+ schedule_func = DIFFUSERS_TYPE_TO_SCHEDULER_FUNCTION[name]
538
+ return schedule_func(optimizer, **lr_scheduler_kwargs) # step_rules and last_epoch are given as kwargs
539
+
540
+ name = SchedulerType(name)
541
+ schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]
542
+
543
+ if name == SchedulerType.CONSTANT:
544
+ return wrap_check_needless_num_warmup_steps(schedule_func(optimizer, **lr_scheduler_kwargs))
545
+
546
+ # All other schedulers require `num_warmup_steps`
547
+ if num_warmup_steps is None:
548
+ raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
549
+
550
+ if name == SchedulerType.CONSTANT_WITH_WARMUP:
551
+ return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, **lr_scheduler_kwargs)
552
+
553
+ if name == SchedulerType.INVERSE_SQRT:
554
+ return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, timescale=timescale, **lr_scheduler_kwargs)
555
+
556
+ # All other schedulers require `num_training_steps`
557
+ if num_training_steps is None:
558
+ raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
559
+
560
+ if name == SchedulerType.COSINE_WITH_RESTARTS:
561
+ return schedule_func(
562
+ optimizer,
563
+ num_warmup_steps=num_warmup_steps,
564
+ num_training_steps=num_training_steps,
565
+ num_cycles=num_cycles,
566
+ **lr_scheduler_kwargs,
567
+ )
568
+
569
+ if name == SchedulerType.POLYNOMIAL:
570
+ return schedule_func(
571
+ optimizer,
572
+ num_warmup_steps=num_warmup_steps,
573
+ num_training_steps=num_training_steps,
574
+ power=power,
575
+ **lr_scheduler_kwargs,
576
+ )
577
+
578
+ if name == SchedulerType.COSINE_WITH_MIN_LR:
579
+ return schedule_func(
580
+ optimizer,
581
+ num_warmup_steps=num_warmup_steps,
582
+ num_training_steps=num_training_steps,
583
+ num_cycles=num_cycles / 2,
584
+ min_lr_rate=min_lr_ratio,
585
+ **lr_scheduler_kwargs,
586
+ )
587
+
588
+ # these schedulers do not require `num_decay_steps`
589
+ if name == SchedulerType.LINEAR or name == SchedulerType.COSINE:
590
+ return schedule_func(
591
+ optimizer,
592
+ num_warmup_steps=num_warmup_steps,
593
+ num_training_steps=num_training_steps,
594
+ **lr_scheduler_kwargs,
595
+ )
596
+
597
+ # All other schedulers require `num_decay_steps`
598
+ if num_decay_steps is None:
599
+ raise ValueError(f"{name} requires `num_decay_steps`, please provide that argument.")
600
+ if name == SchedulerType.WARMUP_STABLE_DECAY:
601
+ return schedule_func(
602
+ optimizer,
603
+ num_warmup_steps=num_warmup_steps,
604
+ num_stable_steps=num_stable_steps,
605
+ num_decay_steps=num_decay_steps,
606
+ num_cycles=num_cycles / 2,
607
+ min_lr_ratio=min_lr_ratio if min_lr_ratio is not None else 0.0,
608
+ **lr_scheduler_kwargs,
609
+ )
610
+
611
+ return schedule_func(
612
+ optimizer,
613
+ num_warmup_steps=num_warmup_steps,
614
+ num_training_steps=num_training_steps,
615
+ num_decay_steps=num_decay_steps,
616
+ **lr_scheduler_kwargs,
617
+ )
618
+
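# Hedged arithmetic sketch of how the step counts above are derived when --lr_warmup_steps and
# --lr_decay_steps are given as a ratio vs. an absolute count (values are illustrative).
max_train_steps = 1600
num_processes = 2
lr_warmup_steps = 0.05   # float -> ratio of training steps
lr_decay_steps = 200     # int   -> absolute number of steps

num_training_steps = max_train_steps * num_processes                        # 3200
num_warmup_steps = int(lr_warmup_steps * num_training_steps)                # 160
num_decay_steps = lr_decay_steps                                            # 200
num_stable_steps = num_training_steps - num_warmup_steps - num_decay_steps  # 2840 (used by warmup_stable_decay)
print(num_warmup_steps, num_stable_steps, num_decay_steps)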
619
+ def resume_from_local_or_hf_if_specified(self, accelerator: Accelerator, args: argparse.Namespace) -> bool:
620
+ if not args.resume:
621
+ return False
622
+
623
+ if not args.resume_from_huggingface:
624
+ logger.info(f"resume training from local state: {args.resume}")
625
+ accelerator.load_state(args.resume)
626
+ return True
627
+
628
+ logger.info(f"resume training from huggingface state: {args.resume}")
629
+ repo_id = args.resume.split("/")[0] + "/" + args.resume.split("/")[1]
630
+ path_in_repo = "/".join(args.resume.split("/")[2:])
631
+ revision = None
632
+ repo_type = None
633
+ if ":" in path_in_repo:
634
+ divided = path_in_repo.split(":")
635
+ if len(divided) == 2:
636
+ path_in_repo, revision = divided
637
+ repo_type = "model"
638
+ else:
639
+ path_in_repo, revision, repo_type = divided
640
+ logger.info(f"Downloading state from huggingface: {repo_id}/{path_in_repo}@{revision}")
641
+
642
+ list_files = huggingface_utils.list_dir(
643
+ repo_id=repo_id,
644
+ subfolder=path_in_repo,
645
+ revision=revision,
646
+ token=args.huggingface_token,
647
+ repo_type=repo_type,
648
+ )
649
+
650
+ async def download(filename) -> str:
651
+ def task():
652
+ return huggingface_hub.hf_hub_download(
653
+ repo_id=repo_id,
654
+ filename=filename,
655
+ revision=revision,
656
+ repo_type=repo_type,
657
+ token=args.huggingface_token,
658
+ )
659
+
660
+ return await asyncio.get_event_loop().run_in_executor(None, task)
661
+
662
+ loop = asyncio.get_event_loop()
663
+ results = loop.run_until_complete(asyncio.gather(*[download(filename=filename.rfilename) for filename in list_files]))
664
+ if len(results) == 0:
665
+ raise ValueError(
666
+ "No files found in the specified repo id/path/revision / 指定されたリポジトリID/パス/リビジョンにファイルが見つかりませんでした"
667
+ )
668
+ dirname = os.path.dirname(results[0])
669
+ accelerator.load_state(dirname)
670
+
671
+ return True
672
+
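# Hedged sketch of the --resume string format parsed above when --resume_from_huggingface is set:
# {repo_id}/{path_in_repo}:{revision}:{repo_type}. The values below are illustrative.
resume = "user/my-repo/states/state-000100:main:model"
parts = resume.split("/")
repo_id = "/".join(parts[:2])        # "user/my-repo"
path_in_repo = "/".join(parts[2:])   # "states/state-000100:main:model"
revision = repo_type = None
if ":" in path_in_repo:
    divided = path_in_repo.split(":")
    if len(divided) == 2:
        path_in_repo, revision = divided
        repo_type = "model"
    else:
        path_in_repo, revision, repo_type = divided
print(repo_id, path_in_repo, revision, repo_type)  # user/my-repo states/state-000100 main model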
673
+ def sample_images(self, accelerator, args, epoch, global_step, device, vae, transformer, sample_parameters):
674
+ pass
675
+
676
+ def get_noisy_model_input_and_timesteps(
677
+ self,
678
+ args: argparse.Namespace,
679
+ noise: torch.Tensor,
680
+ latents: torch.Tensor,
681
+ noise_scheduler: FlowMatchDiscreteScheduler,
682
+ device: torch.device,
683
+ dtype: torch.dtype,
684
+ ):
685
+ batch_size = noise.shape[0]
686
+
687
+ if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid" or args.timestep_sampling == "shift":
688
+ if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid":
689
+ # Simple random t-based noise sampling
690
+ if args.timestep_sampling == "sigmoid":
691
+ t = torch.sigmoid(args.sigmoid_scale * torch.randn((batch_size,), device=device))
692
+ else:
693
+ t = torch.rand((batch_size,), device=device)
694
+
695
+ elif args.timestep_sampling == "shift":
696
+ shift = args.discrete_flow_shift
697
+ logits_norm = torch.randn(batch_size, device=device)
698
+ logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling
699
+ t = logits_norm.sigmoid()
700
+ t = (t * shift) / (1 + (shift - 1) * t)
701
+
702
+ t_min = args.min_timestep if args.min_timestep is not None else 0
703
+ t_max = args.max_timestep if args.max_timestep is not None else 1000.0
704
+ t_min /= 1000.0
705
+ t_max /= 1000.0
706
+ t = t * (t_max - t_min) + t_min # scale to [t_min, t_max], default [0, 1]
707
+
708
+ timesteps = t * 1000.0
709
+ t = t.view(-1, 1, 1, 1, 1)
710
+ noisy_model_input = (1 - t) * latents + t * noise
711
+
712
+ timesteps += 1 # 1 to 1000
713
+ else:
714
+ # Sample a random timestep for each image
715
+ # for weighting schemes where we sample timesteps non-uniformly
716
+ u = compute_density_for_timestep_sampling(
717
+ weighting_scheme=args.weighting_scheme,
718
+ batch_size=batch_size,
719
+ logit_mean=args.logit_mean,
720
+ logit_std=args.logit_std,
721
+ mode_scale=args.mode_scale,
722
+ )
723
+ # indices = (u * noise_scheduler.config.num_train_timesteps).long()
724
+ t_min = args.min_timestep if args.min_timestep is not None else 0
725
+ t_max = args.max_timestep if args.max_timestep is not None else 1000
726
+ indices = (u * (t_max - t_min) + t_min).long()
727
+
728
+ timesteps = noise_scheduler.timesteps[indices].to(device=device) # 1 to 1000
729
+
730
+ # Add noise according to flow matching.
731
+ sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype)
732
+ noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents
733
+
734
+ return noisy_model_input, timesteps
735
+
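# Hedged sketch of the "shift" timestep sampling branch above and the flow-matching interpolation
# x_t = (1 - t) * x_0 + t * noise (shapes and values are illustrative).
import torch

batch_size, shift, sigmoid_scale = 4, 3.0, 1.0
t = torch.sigmoid(sigmoid_scale * torch.randn(batch_size))  # base sample in (0, 1)
t = (t * shift) / (1 + (shift - 1) * t)                     # shift pushes t toward 1 (noisier timesteps)
timesteps = t * 1000.0 + 1                                  # scheduler timesteps, roughly 1..1000

latents = torch.randn(batch_size, 16, 1, 8, 8)              # B, C, F, H, W
noise = torch.randn_like(latents)
t_b = t.view(-1, 1, 1, 1, 1)
noisy_model_input = (1 - t_b) * latents + t_b * noise       # linear path between data and noise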
736
+ def train(self, args):
737
+ if args.seed is None:
738
+ args.seed = random.randint(0, 2**32)
739
+ set_seed(args.seed)
740
+
741
+ # Load dataset config
742
+ blueprint_generator = BlueprintGenerator(ConfigSanitizer())
743
+ logger.info(f"Load dataset config from {args.dataset_config}")
744
+ user_config = config_utils.load_user_config(args.dataset_config)
745
+ blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_HUNYUAN_VIDEO)
746
+ train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group, training=True)
747
+
748
+ current_epoch = Value("i", 0)
749
+ current_step = Value("i", 0)
750
+ ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None
751
+ collator = collator_class(current_epoch, current_step, ds_for_collator)
752
+
753
+ # prepare accelerator
754
+ logger.info("preparing accelerator")
755
+ accelerator = prepare_accelerator(args)
756
+ is_main_process = accelerator.is_main_process
757
+
758
+ # prepare dtype
759
+ weight_dtype = torch.float32
760
+ if args.mixed_precision == "fp16":
761
+ weight_dtype = torch.float16
762
+ elif args.mixed_precision == "bf16":
763
+ weight_dtype = torch.bfloat16
764
+
765
+ # HunyuanVideo specific
766
+ vae_dtype = torch.float16 if args.vae_dtype is None else model_utils.str_to_dtype(args.vae_dtype)
767
+
768
+ # get embedding for sampling images
769
+ sample_parameters = vae = None
770
+ if args.sample_prompts:
771
+ sample_parameters = self.process_sample_prompts(
772
+ args, accelerator, args.sample_prompts, args.text_encoder1, args.text_encoder2, args.fp8_llm
773
+ )
774
+
775
+ # Load VAE model for sampling images: VAE is loaded to cpu to save gpu memory
776
+ vae, _, s_ratio, t_ratio = load_vae(vae_dtype=vae_dtype, device="cpu", vae_path=args.vae)
777
+ vae.requires_grad_(False)
778
+ vae.eval()
779
+
780
+ if args.vae_chunk_size is not None:
781
+ vae.set_chunk_size_for_causal_conv_3d(args.vae_chunk_size)
782
+ logger.info(f"Set chunk_size to {args.vae_chunk_size} for CausalConv3d in VAE")
783
+ if args.vae_spatial_tile_sample_min_size is not None:
784
+ vae.enable_spatial_tiling(True)
785
+ vae.tile_sample_min_size = args.vae_spatial_tile_sample_min_size
786
+ vae.tile_latent_min_size = args.vae_spatial_tile_sample_min_size // 8
787
+ elif args.vae_tiling:
788
+ vae.enable_spatial_tiling(True)
789
+
790
+ # load DiT model
791
+ blocks_to_swap = args.blocks_to_swap if args.blocks_to_swap else 0
792
+ loading_device = "cpu" if blocks_to_swap > 0 else accelerator.device
793
+
794
+ logger.info(f"Loading DiT model from {args.dit}")
795
+ if args.sdpa:
796
+ attn_mode = "torch"
797
+ elif args.flash_attn:
798
+ attn_mode = "flash"
799
+ elif args.sage_attn:
800
+ attn_mode = "sageattn"
801
+ elif args.xformers:
802
+ attn_mode = "xformers"
803
+ else:
804
+ raise ValueError(
805
+ f"either --sdpa, --flash_attn, --sage_attn or --xformers must be specified / --sdpa, --flash_attn, --sage_attn, --xformersのいずれかを指定してください"
806
+ )
807
+ transformer = load_transformer(
808
+ args.dit, attn_mode, args.split_attn, loading_device, None, in_channels=args.dit_in_channels
809
+ ) # load as is
810
+
811
+ if blocks_to_swap > 0:
812
+ logger.info(f"enable swap {blocks_to_swap} blocks to CPU from device: {accelerator.device}")
813
+ transformer.enable_block_swap(blocks_to_swap, accelerator.device, supports_backward=True)
814
+ transformer.move_to_device_except_swap_blocks(accelerator.device)
815
+ if args.img_in_txt_in_offloading:
816
+ logger.info("Enable offloading img_in and txt_in to CPU")
817
+ transformer.enable_img_in_txt_in_offloading()
818
+
819
+ if args.gradient_checkpointing:
820
+ transformer.enable_gradient_checkpointing()
821
+
822
+ # prepare optimizer, data loader etc.
823
+ accelerator.print("prepare optimizer, data loader etc.")
824
+
825
+ transformer.requires_grad_(False)
826
+ if accelerator.is_main_process:
827
+ accelerator.print(f"Trainable modules '{args.trainable_modules}'.")
828
+ for name, param in transformer.named_parameters():
829
+ for trainable_module_name in args.trainable_modules:
830
+ if trainable_module_name in name:
831
+ param.requires_grad = True
832
+ break
833
+
834
+ total_params = list(transformer.parameters())
835
+ trainable_params = list(filter(lambda p: p.requires_grad, transformer.parameters()))
836
+ logger.info(
837
+ f"number of trainable parameters: {sum(p.numel() for p in trainable_params) / 1e6} M, total parameters: {sum(p.numel() for p in total_params) / 1e6} M"
838
+ )
839
+ optimizer_name, optimizer_args, optimizer, optimizer_train_fn, optimizer_eval_fn = self.get_optimizer(
840
+ args, trainable_params
841
+ )
842
+
843
+ # prepare dataloader
844
+
845
+ # num workers for data loader: if 0, persistent_workers is not available
846
+ n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers
847
+
848
+ train_dataloader = torch.utils.data.DataLoader(
849
+ train_dataset_group,
850
+ batch_size=1,
851
+ shuffle=True,
852
+ collate_fn=collator,
853
+ num_workers=n_workers,
854
+ persistent_workers=args.persistent_data_loader_workers,
855
+ )
856
+
857
+ # calculate max_train_steps
858
+ if args.max_train_epochs is not None:
859
+ args.max_train_steps = args.max_train_epochs * math.ceil(
860
+ len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps
861
+ )
862
+ accelerator.print(
863
+ f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}"
864
+ )
865
+
866
+ # send max_train_steps to train_dataset_group
867
+ train_dataset_group.set_max_train_steps(args.max_train_steps)
868
+
869
+ # prepare lr_scheduler
870
+ lr_scheduler = self.get_scheduler(args, optimizer, accelerator.num_processes)
871
+
872
+ # prepare training model. accelerator does some magic here
873
+
874
+ # experimental feature: train the model with gradients in fp16/bf16
875
+ dit_dtype = torch.float32
876
+ if args.full_fp16:
877
+ assert (
878
+ args.mixed_precision == "fp16"
879
+ ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。"
880
+ accelerator.print("enable full fp16 training.")
881
+ dit_weight_dtype = torch.float16
882
+ elif args.full_bf16:
883
+ assert (
884
+ args.mixed_precision == "bf16"
885
+ ), "full_bf16 requires mixed precision='bf16' / full_bf16を使う場合はmixed_precision='bf16'を指定してください。"
886
+ accelerator.print("enable full bf16 training.")
887
+ dit_weight_dtype = torch.bfloat16
888
+ else:
889
+ dit_weight_dtype = torch.float32
890
+
891
+ # TODO add fused optimizer and stochastic rounding
892
+
893
+ # cast model to dit_weight_dtype
894
+ # if dit_dtype != dit_weight_dtype:
895
+ logger.info(f"casting model to {dit_weight_dtype}")
896
+ transformer.to(dit_weight_dtype)
897
+
898
+ if blocks_to_swap > 0:
899
+ transformer = accelerator.prepare(transformer, device_placement=[not blocks_to_swap > 0])
900
+ accelerator.unwrap_model(transformer).move_to_device_except_swap_blocks(accelerator.device) # reduce peak memory usage
901
+ accelerator.unwrap_model(transformer).prepare_block_swap_before_forward()
902
+ else:
903
+ transformer = accelerator.prepare(transformer)
904
+
905
+ optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
906
+
907
+ transformer.train()
908
+
909
+ if args.full_fp16:
910
+ # patch accelerator for fp16 training
911
+ # def patch_accelerator_for_fp16_training(accelerator):
912
+ org_unscale_grads = accelerator.scaler._unscale_grads_
913
+
914
+ def _unscale_grads_replacer(optimizer, inv_scale, found_inf, allow_fp16):
915
+ return org_unscale_grads(optimizer, inv_scale, found_inf, True)
916
+
917
+ accelerator.scaler._unscale_grads_ = _unscale_grads_replacer
918
+
919
+ # resume from local or huggingface. accelerator.step is set
920
+ self.resume_from_local_or_hf_if_specified(accelerator, args) # accelerator.load_state(args.resume)
921
+
922
+ # calculate the number of epochs
923
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
924
+ num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
925
+
926
+ # start training
927
+ # total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
928
+
929
+ accelerator.print("running training / 学習開始")
930
+ accelerator.print(f" num train items / 学習画像、動画数: {train_dataset_group.num_train_items}")
931
+ accelerator.print(f" num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}")
932
+ accelerator.print(f" num epochs / epoch数: {num_train_epochs}")
933
+ accelerator.print(
934
+ f" batch size per device / バッチサイズ: {', '.join([str(d.batch_size) for d in train_dataset_group.datasets])}"
935
+ )
936
+ # accelerator.print(f" total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ(並列学習、勾配合計含む): {total_batch_size}")
937
+ accelerator.print(f" gradient accumulation steps / 勾配を合計するステップ数 = {args.gradient_accumulation_steps}")
938
+ accelerator.print(f" total optimization steps / 学習ステップ数: {args.max_train_steps}")
939
+
940
+ if accelerator.is_main_process:
941
+ init_kwargs = {}
942
+ if args.wandb_run_name:
943
+ init_kwargs["wandb"] = {"name": args.wandb_run_name}
944
+ if args.log_tracker_config is not None:
945
+ init_kwargs = toml.load(args.log_tracker_config)
946
+ accelerator.init_trackers(
947
+ "hunyuan_video_ft" if args.log_tracker_name is None else args.log_tracker_name,
948
+ config=train_utils.get_sanitized_config_or_none(args),
949
+ init_kwargs=init_kwargs,
950
+ )
951
+
952
+ # TODO skip until initial step
953
+ progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps")
954
+
955
+ epoch_to_start = 0
956
+ global_step = 0
957
+ noise_scheduler = FlowMatchDiscreteScheduler(shift=args.discrete_flow_shift, reverse=True, solver="euler")
958
+
959
+ loss_recorder = train_utils.LossRecorder()
960
+ del train_dataset_group
961
+
962
+ # function for saving/removing
963
+ def save_model(ckpt_name: str, unwrapped_nw, steps, epoch_no, force_sync_upload=False):
964
+ os.makedirs(args.output_dir, exist_ok=True)
965
+ ckpt_file = os.path.join(args.output_dir, ckpt_name)
966
+
967
+ accelerator.print(f"\nsaving checkpoint: {ckpt_file}")
968
+
969
+ title = args.metadata_title if args.metadata_title is not None else args.output_name
970
+ if args.min_timestep is not None or args.max_timestep is not None:
971
+ min_time_step = args.min_timestep if args.min_timestep is not None else 0
972
+ max_time_step = args.max_timestep if args.max_timestep is not None else 1000
973
+ md_timesteps = (min_time_step, max_time_step)
974
+ else:
975
+ md_timesteps = None
976
+
977
+ sai_metadata = sai_model_spec.build_metadata(
978
+ None,
979
+ ARCHITECTURE_HUNYUAN_VIDEO,
980
+ time.time(),
981
+ title,
982
+ None,
983
+ args.metadata_author,
984
+ args.metadata_description,
985
+ args.metadata_license,
986
+ args.metadata_tags,
987
+ timesteps=md_timesteps,
988
+ is_lora=False,
989
+ )
990
+
991
+ save_file(unwrapped_nw.state_dict(), ckpt_file, sai_metadata)
992
+ if args.huggingface_repo_id is not None:
993
+ huggingface_utils.upload(args, ckpt_file, "/" + ckpt_name, force_sync_upload=force_sync_upload)
994
+
995
+ def remove_model(old_ckpt_name):
996
+ old_ckpt_file = os.path.join(args.output_dir, old_ckpt_name)
997
+ if os.path.exists(old_ckpt_file):
998
+ accelerator.print(f"removing old checkpoint: {old_ckpt_file}")
999
+ os.remove(old_ckpt_file)
1000
+
1001
+ # For --sample_at_first
1002
+ optimizer_eval_fn()
1003
+ self.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, transformer, sample_parameters)
1004
+ optimizer_train_fn()
1005
+ if len(accelerator.trackers) > 0:
1006
+ # log empty object to commit the sample images to wandb
1007
+ accelerator.log({}, step=0)
1008
+
1009
+ # training loop
1010
+
1011
+ # log device and dtype for each model
1012
+ logger.info(f"DiT dtype: {transformer.dtype}, device: {transformer.device}")
1013
+
1014
+ clean_memory_on_device(accelerator.device)
1015
+
1016
+ pos_embed_cache = {}
1017
+
1018
+ for epoch in range(epoch_to_start, num_train_epochs):
1019
+ accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}")
1020
+ current_epoch.value = epoch + 1
1021
+
1022
+ for step, batch in enumerate(train_dataloader):
1023
+ latents, llm_embeds, llm_mask, clip_embeds = batch
1024
+ bsz = latents.shape[0]
1025
+ current_step.value = global_step
1026
+
1027
+ with accelerator.accumulate(transformer):
1028
+ latents = latents * vae_module.SCALING_FACTOR
1029
+
1030
+ # Sample noise that we'll add to the latents
1031
+ noise = torch.randn_like(latents)
1032
+
1033
+ # calculate model input and timesteps
1034
+ noisy_model_input, timesteps = self.get_noisy_model_input_and_timesteps(
1035
+ args, noise, latents, noise_scheduler, accelerator.device, dit_dtype
1036
+ )
1037
+
1038
+ weighting = compute_loss_weighting_for_sd3(
1039
+ args.weighting_scheme, noise_scheduler, timesteps, accelerator.device, dit_dtype
1040
+ )
1041
+
1042
+ # ensure guidance_scale in args is float
1043
+ guidance_vec = torch.full((bsz,), float(args.guidance_scale), device=accelerator.device) # , dtype=dit_dtype)
1044
+
1045
+ # ensure the hidden state will require grad
1046
+ if args.gradient_checkpointing:
1047
+ noisy_model_input.requires_grad_(True)
1048
+ guidance_vec.requires_grad_(True)
1049
+
1050
+ pos_emb_shape = latents.shape[1:]
1051
+ if pos_emb_shape not in pos_embed_cache:
1052
+ freqs_cos, freqs_sin = get_rotary_pos_embed_by_shape(
1053
+ accelerator.unwrap_model(transformer), latents.shape[2:]
1054
+ )
1055
+ # freqs_cos = freqs_cos.to(device=accelerator.device, dtype=dit_dtype)
1056
+ # freqs_sin = freqs_sin.to(device=accelerator.device, dtype=dit_dtype)
1057
+ pos_embed_cache[pos_emb_shape] = (freqs_cos, freqs_sin)
1058
+ else:
1059
+ freqs_cos, freqs_sin = pos_embed_cache[pos_emb_shape]
1060
+
1061
+ # call DiT
1062
+ latents = latents.to(device=accelerator.device, dtype=dit_dtype)
1063
+ noisy_model_input = noisy_model_input.to(device=accelerator.device, dtype=dit_dtype)
1064
+ # timesteps = timesteps.to(device=accelerator.device, dtype=dit_dtype)
1065
+ # llm_embeds = llm_embeds.to(device=accelerator.device, dtype=dit_dtype)
1066
+ # llm_mask = llm_mask.to(device=accelerator.device)
1067
+ # clip_embeds = clip_embeds.to(device=accelerator.device, dtype=dit_dtype)
1068
+ with accelerator.autocast():
1069
+ model_pred = transformer(
1070
+ noisy_model_input,
1071
+ timesteps,
1072
+ text_states=llm_embeds,
1073
+ text_mask=llm_mask,
1074
+ text_states_2=clip_embeds,
1075
+ freqs_cos=freqs_cos,
1076
+ freqs_sin=freqs_sin,
1077
+ guidance=guidance_vec,
1078
+ return_dict=False,
1079
+ )
1080
+
1081
+ # flow matching loss
1082
+ target = noise - latents
1083
+
1084
+ loss = torch.nn.functional.mse_loss(model_pred.to(dit_dtype), target, reduction="none")
1085
+
1086
+ if weighting is not None:
1087
+ loss = loss * weighting
1088
+ # loss = loss.mean([1, 2, 3])
1089
+ # # min snr gamma, scale v pred loss like noise pred, v pred like loss, debiased estimation etc.
1090
+ # loss = self.post_process_loss(loss, args, timesteps, noise_scheduler)
1091
+
1092
+ loss = loss.mean() # mean over all elements, so no need to divide by batch_size
1093
+
1094
+ accelerator.backward(loss)
1095
+ if accelerator.sync_gradients:
1096
+ # self.all_reduce_network(accelerator, network) # sync DDP grad manually
1097
+ state = accelerate.PartialState()
1098
+ if state.distributed_type != accelerate.DistributedType.NO:
1099
+ for param in transformer.parameters():
1100
+ if param.grad is not None:
1101
+ param.grad = accelerator.reduce(param.grad, reduction="mean")
1102
+
1103
+ if args.max_grad_norm != 0.0:
1104
+ params_to_clip = accelerator.unwrap_model(transformer).parameters()
1105
+ accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
1106
+
1107
+ optimizer.step()
1108
+ lr_scheduler.step()
1109
+ optimizer.zero_grad(set_to_none=True)
1110
+
1111
+ # Checks if the accelerator has performed an optimization step behind the scenes
1112
+ if accelerator.sync_gradients:
1113
+ progress_bar.update(1)
1114
+ global_step += 1
1115
+
1116
+ optimizer_eval_fn()
1117
+ self.sample_images(
1118
+ accelerator, args, None, global_step, accelerator.device, vae, transformer, sample_parameters
1119
+ )
1120
+
1121
+ # save the model every specified number of steps
1122
+ if args.save_every_n_steps is not None and global_step % args.save_every_n_steps == 0:
1123
+ accelerator.wait_for_everyone()
1124
+ if accelerator.is_main_process:
1125
+ ckpt_name = train_utils.get_step_ckpt_name(args.output_name, global_step)
1126
+ save_model(ckpt_name, accelerator.unwrap_model(transformer), global_step, epoch)
1127
+
1128
+ if args.save_state:
1129
+ train_utils.save_and_remove_state_stepwise(args, accelerator, global_step)
1130
+
1131
+ remove_step_no = train_utils.get_remove_step_no(args, global_step)
1132
+ if remove_step_no is not None:
1133
+ remove_ckpt_name = train_utils.get_step_ckpt_name(args.output_name, remove_step_no)
1134
+ remove_model(remove_ckpt_name)
1135
+ optimizer_train_fn()
1136
+
1137
+ current_loss = loss.detach().item()
1138
+ loss_recorder.add(epoch=epoch, step=step, loss=current_loss)
1139
+ avr_loss: float = loss_recorder.moving_average
1140
+ logs = {"avr_loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]}
1141
+ progress_bar.set_postfix(**logs)
1142
+
1143
+ if len(accelerator.trackers) > 0:
1144
+ logs = {"loss": current_loss, "lr": lr_scheduler.get_last_lr()[0]}
1145
+ accelerator.log(logs, step=global_step)
1146
+
1147
+ if global_step >= args.max_train_steps:
1148
+ break
1149
+
1150
+ if len(accelerator.trackers) > 0:
1151
+ logs = {"loss/epoch": loss_recorder.moving_average}
1152
+ accelerator.log(logs, step=epoch + 1)
1153
+
1154
+ accelerator.wait_for_everyone()
1155
+
1156
+ # save the model every specified number of epochs
1157
+ optimizer_eval_fn()
1158
+ if args.save_every_n_epochs is not None:
1159
+ saving = (epoch + 1) % args.save_every_n_epochs == 0 and (epoch + 1) < num_train_epochs
1160
+ if is_main_process and saving:
1161
+ ckpt_name = train_utils.get_epoch_ckpt_name(args.output_name, epoch + 1)
1162
+ save_model(ckpt_name, accelerator.unwrap_model(transformer), global_step, epoch + 1)
1163
+
1164
+ remove_epoch_no = train_utils.get_remove_epoch_no(args, epoch + 1)
1165
+ if remove_epoch_no is not None:
1166
+ remove_ckpt_name = train_utils.get_epoch_ckpt_name(args.output_name, remove_epoch_no)
1167
+ remove_model(remove_ckpt_name)
1168
+
1169
+ if args.save_state:
1170
+ train_utils.save_and_remove_state_on_epoch_end(args, accelerator, epoch + 1)
1171
+
1172
+ self.sample_images(accelerator, args, epoch + 1, global_step, accelerator.device, vae, transformer, sample_parameters)
1173
+ optimizer_train_fn()
1174
+
1175
+ # end of epoch
1176
+
1177
+ if is_main_process:
1178
+ transformer = accelerator.unwrap_model(transformer)
1179
+
1180
+ accelerator.end_training()
1181
+ optimizer_eval_fn()
1182
+
1183
+ if args.save_state or args.save_state_on_train_end:
1184
+ train_utils.save_state_on_train_end(args, accelerator)
1185
+
1186
+ if is_main_process:
1187
+ ckpt_name = train_utils.get_last_ckpt_name(args.output_name)
1188
+ save_model(ckpt_name, transformer, global_step, num_train_epochs, force_sync_upload=True)
1189
+
1190
+ logger.info("model saved.")
1191
+
1192
+
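# Hedged sketch of the flow-matching objective computed in the training loop above: the DiT is
# trained to predict the constant velocity (noise - latents) along the linear path; the tensors
# below are illustrative stand-ins for the real model output.
import torch

latents = torch.randn(2, 16, 1, 8, 8)
noise = torch.randn_like(latents)
model_pred = torch.randn_like(latents)   # stand-in for transformer(noisy_model_input, ...)

target = noise - latents
loss = torch.nn.functional.mse_loss(model_pred, target, reduction="none")
loss = loss.mean()   # elementwise MSE averaged over everything, so no per-batch division is needed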
1193
+ def setup_parser() -> argparse.ArgumentParser:
1194
+ def int_or_float(value):
1195
+ if value.endswith("%"):
1196
+ try:
1197
+ return float(value[:-1]) / 100.0
1198
+ except ValueError:
1199
+ raise argparse.ArgumentTypeError(f"Value '{value}' is not a valid percentage")
1200
+ try:
1201
+ float_value = float(value)
1202
+ if float_value >= 1 and float_value.is_integer():
1203
+ return int(value)
1204
+ return float(value)
1205
+ except ValueError:
1206
+ raise argparse.ArgumentTypeError(f"'{value}' is not an int or float")
1207
+
1208
+ parser = argparse.ArgumentParser()
1209
+
1210
+ # general settings
1211
+ parser.add_argument(
1212
+ "--config_file",
1213
+ type=str,
1214
+ default=None,
1215
+ help="using .toml instead of args to pass hyperparameter / ハイパーパラメータを引数ではなく.tomlファイルで渡す",
1216
+ )
1217
+ parser.add_argument(
1218
+ "--dataset_config",
1219
+ type=pathlib.Path,
1220
+ default=None,
1221
+ required=True,
1222
+ help="config file for dataset / データセットの設定ファイル",
1223
+ )
1224
+
1225
+ # training settings
1226
+ parser.add_argument(
1227
+ "--sdpa",
1228
+ action="store_true",
1229
+ help="use sdpa for CrossAttention (requires PyTorch 2.0) / CrossAttentionにsdpaを使う(PyTorch 2.0が必要)",
1230
+ )
1231
+ parser.add_argument(
1232
+ "--flash_attn",
1233
+ action="store_true",
1234
+ help="use FlashAttention for CrossAttention, requires FlashAttention / CrossAttentionにFlashAttentionを使う、FlashAttentionが必要",
1235
+ )
1236
+ parser.add_argument(
1237
+ "--sage_attn",
1238
+ action="store_true",
1239
+ help="use SageAttention. requires SageAttention / SageAttentionを使う。SageAttentionが必要",
1240
+ )
1241
+ parser.add_argument(
1242
+ "--xformers",
1243
+ action="store_true",
1244
+ help="use xformers for CrossAttention, requires xformers / CrossAttentionにxformersを使う、xformersが必要",
1245
+ )
1246
+ parser.add_argument(
1247
+ "--split_attn",
1248
+ action="store_true",
1249
+ help="use split attention for attention calculation (split batch size=1, affects memory usage and speed)"
1250
+ " / attentionを分割して計算する(バッチサイズ=1に分割、メモリ使用量と速度に影響)",
1251
+ )
1252
+
1253
+ parser.add_argument("--max_train_steps", type=int, default=1600, help="training steps / 学習ステップ数")
1254
+ parser.add_argument(
1255
+ "--max_train_epochs",
1256
+ type=int,
1257
+ default=None,
1258
+ help="training epochs (overrides max_train_steps) / 学習エポック数(max_train_stepsを上書きします)",
1259
+ )
1260
+ parser.add_argument(
1261
+ "--max_data_loader_n_workers",
1262
+ type=int,
1263
+ default=8,
1264
+ help="max num workers for DataLoader (lower is less main RAM usage, faster epoch start and slower data loading) / DataLoaderの最大プロセス数(小さい値ではメインメモリの使用量が減りエポック間の待ち時間が減りますが、データ読み込みは遅くなります)",
1265
+ )
1266
+ parser.add_argument(
1267
+ "--persistent_data_loader_workers",
1268
+ action="store_true",
1269
+ help="persistent DataLoader workers (useful for reduce time gap between epoch, but may use more memory) / DataLoader のワーカーを持続させる (エポック間の時間差を少なくするのに有効だが、より多くのメモリを消費する可能性がある)",
1270
+ )
1271
+ parser.add_argument("--seed", type=int, default=None, help="random seed for training / 学習時の乱数のseed")
1272
+ parser.add_argument(
1273
+ "--gradient_checkpointing", action="store_true", help="enable gradient checkpointing / gradient checkpointingを有効にする"
1274
+ )
1275
+ parser.add_argument(
1276
+ "--gradient_accumulation_steps",
1277
+ type=int,
1278
+ default=1,
1279
+ help="Number of updates steps to accumulate before performing a backward/update pass / 学習時に逆伝播をする前に勾配を合計するステップ数",
1280
+ )
1281
+ parser.add_argument(
1282
+ "--mixed_precision",
1283
+ type=str,
1284
+ default="no",
1285
+ choices=["no", "fp16", "bf16"],
1286
+ help="use mixed precision / 混合精度を使う場合、その精度",
1287
+ )
1288
+ parser.add_argument("--trainable_modules", nargs="+", default=".", help="Enter a list of trainable modules")
1289
+
1290
+ parser.add_argument(
1291
+ "--logging_dir",
1292
+ type=str,
1293
+ default=None,
1294
+ help="enable logging and output TensorBoard log to this directory / ログ出力を有効にしてこのディレクトリにTensorBoard用のログを出力する",
1295
+ )
1296
+ parser.add_argument(
1297
+ "--log_with",
1298
+ type=str,
1299
+ default=None,
1300
+ choices=["tensorboard", "wandb", "all"],
1301
+ help="what logging tool(s) to use (if 'all', TensorBoard and WandB are both used) / ログ出力に使用するツール (allを指定するとTensorBoardとWandBの両方が使用される)",
1302
+ )
1303
+ parser.add_argument(
1304
+ "--log_prefix", type=str, default=None, help="add prefix for each log directory / ログディレクトリ名の先頭に追加する文字列"
1305
+ )
1306
+ parser.add_argument(
1307
+ "--log_tracker_name",
1308
+ type=str,
1309
+ default=None,
1310
+ help="name of tracker to use for logging, default is script-specific default name / ログ出力に使用するtrackerの名前、省略時はスクリプトごとのデフォルト名",
1311
+ )
1312
+ parser.add_argument(
1313
+ "--wandb_run_name",
1314
+ type=str,
1315
+ default=None,
1316
+ help="The name of the specific wandb session / wandb ログに表示される特定の実行の名前",
1317
+ )
1318
+ parser.add_argument(
1319
+ "--log_tracker_config",
1320
+ type=str,
1321
+ default=None,
1322
+ help="path to tracker config file to use for logging / ログ出力に使用するtrackerの設定ファイルのパス",
1323
+ )
1324
+ parser.add_argument(
1325
+ "--wandb_api_key",
1326
+ type=str,
1327
+ default=None,
1328
+ help="specify WandB API key to log in before starting training (optional). / WandB APIキーを指定して学習開始前にログインする(オプション)",
1329
+ )
1330
+ parser.add_argument("--log_config", action="store_true", help="log training configuration / 学習設定をログに出力する")
1331
+
1332
+ parser.add_argument(
1333
+ "--ddp_timeout",
1334
+ type=int,
1335
+ default=None,
1336
+ help="DDP timeout (min, None for default of accelerate) / DDPのタイムアウト(分、Noneでaccelerateのデフォルト)",
1337
+ )
1338
+ parser.add_argument(
1339
+ "--ddp_gradient_as_bucket_view",
1340
+ action="store_true",
1341
+ help="enable gradient_as_bucket_view for DDP / DDPでgradient_as_bucket_viewを有効にする",
1342
+ )
1343
+ parser.add_argument(
1344
+ "--ddp_static_graph",
1345
+ action="store_true",
1346
+ help="enable static_graph for DDP / DDPでstatic_graphを有効にする",
1347
+ )
1348
+
1349
+ parser.add_argument(
1350
+ "--sample_every_n_steps",
1351
+ type=int,
1352
+ default=None,
1353
+ help="generate sample images every N steps / 学習中のモデルで指定ステップごとにサンプル出力する",
1354
+ )
1355
+ parser.add_argument(
1356
+ "--sample_at_first", action="store_true", help="generate sample images before training / 学習前にサンプル出力する"
1357
+ )
1358
+ parser.add_argument(
1359
+ "--sample_every_n_epochs",
1360
+ type=int,
1361
+ default=None,
1362
+ help="generate sample images every N epochs (overwrites n_steps) / 学習中のモデルで指定エポックごとにサンプル出力する(ステップ数指定を上書きします)",
1363
+ )
1364
+ parser.add_argument(
1365
+ "--sample_prompts",
1366
+ type=str,
1367
+ default=None,
1368
+ help="file for prompts to generate sample images / 学習中モデルのサンプル出力用プロンプトのファイル",
1369
+ )
1370
+
1371
+ # optimizer and lr scheduler settings
1372
+ parser.add_argument(
1373
+ "--optimizer_type",
1374
+ type=str,
1375
+ default="",
1376
+ help="Optimizer to use / オプティマイザの種類: AdamW (default), AdamW8bit, AdaFactor. "
1377
+ "Also, you can use any optimizer by specifying the full path to the class, like 'torch.optim.AdamW', 'bitsandbytes.optim.AdEMAMix8bit' or 'bitsandbytes.optim.PagedAdEMAMix8bit' etc.",
1378
+ )
1379
+ parser.add_argument(
1380
+ "--optimizer_args",
1381
+ type=str,
1382
+ default=None,
1383
+ nargs="*",
1384
+ help='additional arguments for optimizer (like "weight_decay=0.01 betas=0.9,0.999 ...") / オプティマイザの追加引数(例: "weight_decay=0.01 betas=0.9,0.999 ...")',
1385
+ )
1386
+ parser.add_argument("--learning_rate", type=float, default=2.0e-6, help="learning rate / 学習率")
1387
+ parser.add_argument(
1388
+ "--max_grad_norm",
1389
+ default=1.0,
1390
+ type=float,
1391
+ help="Max gradient norm, 0 for no clipping / 勾配正規化の最大norm、0でclippingを行わない",
1392
+ )
1393
+
1394
+ parser.add_argument(
1395
+ "--lr_scheduler",
1396
+ type=str,
1397
+ default="constant",
1398
+ help="scheduler to use for learning rate / 学習率のスケジューラ: linear, cosine, cosine_with_restarts, polynomial, constant (default), constant_with_warmup, adafactor",
1399
+ )
1400
+ parser.add_argument(
1401
+ "--lr_warmup_steps",
1402
+ type=int_or_float,
1403
+ default=0,
1404
+ help="Int number of steps for the warmup in the lr scheduler (default is 0) or float with ratio of train steps"
1405
+ " / 学習率のスケジューラをウォームアップするステップ数(デフォルト0)、または学習ステップの比率(1未満のfloat値の場合)",
1406
+ )
1407
+ parser.add_argument(
1408
+ "--lr_decay_steps",
1409
+ type=int_or_float,
1410
+ default=0,
1411
+ help="Int number of steps for the decay in the lr scheduler (default is 0) or float (<1) with ratio of train steps"
1412
+ " / 学習率のスケジューラを減衰させるステップ数(デフォルト0)、または学習ステップの比率(1未満のfloat値の場合)",
1413
+ )
1414
+ parser.add_argument(
1415
+ "--lr_scheduler_num_cycles",
1416
+ type=int,
1417
+ default=1,
1418
+ help="Number of restarts for cosine scheduler with restarts / cosine with restartsスケジューラでのリスタート回数",
1419
+ )
1420
+ parser.add_argument(
1421
+ "--lr_scheduler_power",
1422
+ type=float,
1423
+ default=1,
1424
+ help="Polynomial power for polynomial scheduler / polynomialスケジューラでのpolynomial power",
1425
+ )
1426
+ parser.add_argument(
1427
+ "--lr_scheduler_timescale",
1428
+ type=int,
1429
+ default=None,
1430
+ help="Inverse sqrt timescale for inverse sqrt scheduler, defaults to `num_warmup_steps`"
1431
+ + " / 逆平方根スケジューラのタイムスケール、デフォルトは`num_warmup_steps`",
1432
+ )
1433
+ parser.add_argument(
1434
+ "--lr_scheduler_min_lr_ratio",
1435
+ type=float,
1436
+ default=None,
1437
+ help="The minimum learning rate as a ratio of the initial learning rate for cosine with min lr scheduler and warmup decay scheduler"
1438
+ + " / 初期学習率の比率としての最小学習率を指定する、cosine with min lr と warmup decay スケジューラ で有効",
1439
+ )
1440
+ parser.add_argument("--lr_scheduler_type", type=str, default="", help="custom scheduler module / 使用するスケジューラ")
1441
+ parser.add_argument(
1442
+ "--lr_scheduler_args",
1443
+ type=str,
1444
+ default=None,
1445
+ nargs="*",
1446
+ help='additional arguments for scheduler (like "T_max=100") / スケジューラの追加引数(例: "T_max=100")',
1447
+ )
1448
+
1449
+ # model settings
1450
+ parser.add_argument("--dit", type=str, required=True, help="DiT checkpoint path / DiTのチェックポイントのパス")
1451
+ parser.add_argument("--dit_dtype", type=str, default=None, help="data type for DiT, default is bfloat16")
1452
+ parser.add_argument("--dit_in_channels", type=int, default=16, help="input channels for DiT, default is 16, skyreels I2V is 32")
1453
+ parser.add_argument("--vae", type=str, help="VAE checkpoint path / VAEのチェックポイントのパス")
1454
+ parser.add_argument("--vae_dtype", type=str, default=None, help="data type for VAE, default is float16")
1455
+ parser.add_argument(
1456
+ "--vae_tiling",
1457
+ action="store_true",
1458
+ help="enable spatial tiling for VAE, default is False. If vae_spatial_tile_sample_min_size is set, this is automatically enabled."
1459
+ " / VAEの空間タイリングを有効にする、デフォルトはFalse。vae_spatial_tile_sample_min_sizeが設定されている場合、自動的に有効になります。",
1460
+ )
1461
+ parser.add_argument("--vae_chunk_size", type=int, default=None, help="chunk size for CausalConv3d in VAE")
1462
+ parser.add_argument(
1463
+ "--vae_spatial_tile_sample_min_size", type=int, default=None, help="spatial tile sample min size for VAE, default 256"
1464
+ )
1465
+ parser.add_argument("--text_encoder1", type=str, help="Text Encoder 1 directory / テキストエンコーダ1のディレクトリ")
1466
+ parser.add_argument("--text_encoder2", type=str, help="Text Encoder 2 directory / テキストエンコーダ2のディレクトリ")
1467
+ parser.add_argument("--text_encoder_dtype", type=str, default=None, help="data type for Text Encoder, default is float16")
1468
+ parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for LLM / LLMにfp8を使う")
1469
+ parser.add_argument("--full_fp16", action="store_true", help="fp16 training including gradients / 勾配も含めてfp16で学習する")
1470
+ parser.add_argument("--full_bf16", action="store_true", help="bf16 training including gradients / 勾配も含めてbf16で学習する")
1471
+
1472
+ parser.add_argument(
1473
+ "--blocks_to_swap",
1474
+ type=int,
1475
+ default=None,
1476
+ help="number of blocks to swap in the model, max XXX / モデル内のブロックの数、最大XXX",
1477
+ )
1478
+ parser.add_argument(
1479
+ "--img_in_txt_in_offloading",
1480
+ action="store_true",
1481
+ help="offload img_in and txt_in to cpu / img_inとtxt_inをCPUにオフロードする",
1482
+ )
1483
+
1484
+ # parser.add_argument("--flow_shift", type=float, default=7.0, help="Shift factor for flow matching schedulers")
1485
+ parser.add_argument("--guidance_scale", type=float, default=1.0, help="Embedded classifier-free guidance scale.")
1486
+ parser.add_argument(
1487
+ "--timestep_sampling",
1488
+ choices=["sigma", "uniform", "sigmoid", "shift"],
1489
+ default="sigma",
1490
+ help="Method to sample timesteps: sigma-based, uniform random, sigmoid of random normal and shift of sigmoid."
1491
+ " / タイムステップをサンプリングする方法:sigma、random uniform、random normalのsigmoid、sigmoidのシフト。",
1492
+ )
1493
+ parser.add_argument(
1494
+ "--discrete_flow_shift",
1495
+ type=float,
1496
+ default=1.0,
1497
+ help="Discrete flow shift for the Euler Discrete Scheduler, default is 1.0. / Euler Discrete Schedulerの離散フローシフト、デフォルトは1.0。",
1498
+ )
1499
+ parser.add_argument(
1500
+ "--sigmoid_scale",
1501
+ type=float,
1502
+ default=1.0,
1503
+ help='Scale factor for sigmoid timestep sampling (only used when timestep-sampling is "sigmoid" or "shift"). / sigmoidタイムステップサンプリングの倍率(timestep-samplingが"sigmoid"または"shift"の場合のみ有効)。',
1504
+ )
1505
+ parser.add_argument(
1506
+ "--weighting_scheme",
1507
+ type=str,
1508
+ default="none",
1509
+ choices=["logit_normal", "mode", "cosmap", "sigma_sqrt", "none"],
1510
+ help="weighting scheme for timestep distribution. Default is none"
1511
+ " / タイムステップ分布の重み付けスキーム、デフォルトはnone",
1512
+ )
1513
+ parser.add_argument(
1514
+ "--logit_mean",
1515
+ type=float,
1516
+ default=0.0,
1517
+ help="mean to use when using the `'logit_normal'` weighting scheme / `'logit_normal'`重み付けスキームを使用する場合の平均",
1518
+ )
1519
+ parser.add_argument(
1520
+ "--logit_std",
1521
+ type=float,
1522
+ default=1.0,
1523
+ help="std to use when using the `'logit_normal'` weighting scheme / `'logit_normal'`重み付けスキームを使用する場合のstd",
1524
+ )
1525
+ parser.add_argument(
1526
+ "--mode_scale",
1527
+ type=float,
1528
+ default=1.29,
1529
+ help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme` / モード重み付けスキームのスケール",
1530
+ )
1531
+ parser.add_argument(
1532
+ "--min_timestep",
1533
+ type=int,
1534
+ default=None,
1535
+ help="set minimum time step for training (0~999, default is 0) / 学習時のtime stepの最小値を設定する(0~999で指定、省略時はデフォルト値(0)) ",
1536
+ )
1537
+ parser.add_argument(
1538
+ "--max_timestep",
1539
+ type=int,
1540
+ default=None,
1541
+ help="set maximum time step for training (1~1000, default is 1000) / 学習時のtime stepの最大値を設定する(1~1000で指定、省略時はデフォルト値(1000))",
1542
+ )
1543
+
1544
+ # save and load settings
1545
+ parser.add_argument(
1546
+ "--output_dir", type=str, default=None, help="directory to output trained model / 学習後のモデル出力先ディレクトリ"
1547
+ )
1548
+ parser.add_argument(
1549
+ "--output_name",
1550
+ type=str,
1551
+ default=None,
1552
+ required=True,
1553
+ help="base name of trained model file / 学習後のモデルの拡張子を除くファイル名",
1554
+ )
1555
+ parser.add_argument("--resume", type=str, default=None, help="saved state to resume training / 学習再開するモデルのstate")
1556
+
1557
+ parser.add_argument(
1558
+ "--save_every_n_epochs",
1559
+ type=int,
1560
+ default=None,
1561
+ help="save checkpoint every N epochs / 学習中のモデルを指定エポックごとに保存する",
1562
+ )
1563
+ parser.add_argument(
1564
+ "--save_every_n_steps",
1565
+ type=int,
1566
+ default=None,
1567
+ help="save checkpoint every N steps / 学習中のモデルを指定ステップごとに保存する",
1568
+ )
1569
+ parser.add_argument(
1570
+ "--save_last_n_epochs",
1571
+ type=int,
1572
+ default=None,
1573
+ help="save last N checkpoints when saving every N epochs (remove older checkpoints) / 指定エポックごとにモデルを保存するとき最大Nエポック保存する(古いチェックポイントは削除する)",
1574
+ )
1575
+ parser.add_argument(
1576
+ "--save_last_n_epochs_state",
1577
+ type=int,
1578
+ default=None,
1579
+ help="save last N checkpoints of state (overrides the value of --save_last_n_epochs)/ 最大Nエポックstateを保存する(--save_last_n_epochsの指定を上書きする)",
1580
+ )
1581
+ parser.add_argument(
1582
+ "--save_last_n_steps",
1583
+ type=int,
1584
+ default=None,
1585
+ help="save checkpoints until N steps elapsed (remove older checkpoints if N steps elapsed) / 指定ステップごとにモデルを保存するとき、このステップ数経過するまで保存する(このステップ数経過したら削除する)",
1586
+ )
1587
+ parser.add_argument(
1588
+ "--save_last_n_steps_state",
1589
+ type=int,
1590
+ default=None,
1591
+ help="save states until N steps elapsed (remove older states if N steps elapsed, overrides --save_last_n_steps) / 指定ステップごとにstateを保存するとき、このステップ数経過するまで保存する(このステップ数経過したら削除する。--save_last_n_stepsを上書きする)",
1592
+ )
1593
+ parser.add_argument(
1594
+ "--save_state",
1595
+ action="store_true",
1596
+ help="save training state additionally (including optimizer states etc.) when saving model / optimizerなど学習状態も含めたstateをモデル保存時に追加で保存する",
1597
+ )
1598
+ parser.add_argument(
1599
+ "--save_state_on_train_end",
1600
+ action="store_true",
1601
+ help="save training state (including optimizer states etc.) on train end even if --save_state is not specified"
1602
+ " / --save_stateが未指定時にもoptimizerなど学習状態も含めたstateを学習終了時に保存する",
1603
+ )
1604
+
1605
+ # SAI Model spec
1606
+ parser.add_argument(
1607
+ "--metadata_title",
1608
+ type=str,
1609
+ default=None,
1610
+ help="title for model metadata (default is output_name) / メタデータに書き込まれるモデルタイトル、省略時はoutput_name",
1611
+ )
1612
+ parser.add_argument(
1613
+ "--metadata_author",
1614
+ type=str,
1615
+ default=None,
1616
+ help="author name for model metadata / メタデータに書き込まれるモデル作者名",
1617
+ )
1618
+ parser.add_argument(
1619
+ "--metadata_description",
1620
+ type=str,
1621
+ default=None,
1622
+ help="description for model metadata / メタデータに書き込まれるモデル説明",
1623
+ )
1624
+ parser.add_argument(
1625
+ "--metadata_license",
1626
+ type=str,
1627
+ default=None,
1628
+ help="license for model metadata / メタデータに書き込まれるモデルライセンス",
1629
+ )
1630
+ parser.add_argument(
1631
+ "--metadata_tags",
1632
+ type=str,
1633
+ default=None,
1634
+ help="tags for model metadata, separated by comma / メタデータに書き込まれるモデルタグ、カンマ区切り",
1635
+ )
1636
+
1637
+ # huggingface settings
1638
+ parser.add_argument(
1639
+ "--huggingface_repo_id",
1640
+ type=str,
1641
+ default=None,
1642
+ help="huggingface repo name to upload / huggingfaceにアップロードするリポジトリ名",
1643
+ )
1644
+ parser.add_argument(
1645
+ "--huggingface_repo_type",
1646
+ type=str,
1647
+ default=None,
1648
+ help="huggingface repo type to upload / huggingfaceにアップロードするリポジトリの種類",
1649
+ )
1650
+ parser.add_argument(
1651
+ "--huggingface_path_in_repo",
1652
+ type=str,
1653
+ default=None,
1654
+ help="huggingface model path to upload files / huggingfaceにアップロードするファイルのパス",
1655
+ )
1656
+ parser.add_argument("--huggingface_token", type=str, default=None, help="huggingface token / huggingfaceのトークン")
1657
+ parser.add_argument(
1658
+ "--huggingface_repo_visibility",
1659
+ type=str,
1660
+ default=None,
1661
+ help="huggingface repository visibility ('public' for public, 'private' or None for private) / huggingfaceにアップロードするリポジトリの公開設定('public'で公開、'private'またはNoneで非公開)",
1662
+ )
1663
+ parser.add_argument(
1664
+ "--save_state_to_huggingface", action="store_true", help="save state to huggingface / huggingfaceにstateを保存する"
1665
+ )
1666
+ parser.add_argument(
1667
+ "--resume_from_huggingface",
1668
+ action="store_true",
1669
+ help="resume from huggingface (ex: --resume {repo_id}/{path_in_repo}:{revision}:{repo_type}) / huggingfaceから学習を再開する(例: --resume {repo_id}/{path_in_repo}:{revision}:{repo_type})",
1670
+ )
1671
+ parser.add_argument(
1672
+ "--async_upload",
1673
+ action="store_true",
1674
+ help="upload to huggingface asynchronously / huggingfaceに非同期でアップロードする",
1675
+ )
1676
+
1677
+ return parser
1678
+
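# Hedged usage sketch for the fine-tuning entry point defined in this file (the script name and
# all paths are illustrative; the flags are the ones registered in setup_parser above):
#
#   accelerate launch hv_train.py \
#       --dit ckpts/hunyuan_video_dit.safetensors \
#       --vae ckpts/hunyuan_video_vae.safetensors \
#       --dataset_config dataset.toml \
#       --output_dir output --output_name hv-ft \
#       --mixed_precision bf16 --sdpa --gradient_checkpointing \
#       --optimizer_type AdamW8bit --learning_rate 2e-6 \
#       --timestep_sampling shift --discrete_flow_shift 7.0 \
#       --max_train_epochs 16 --save_every_n_epochs 1 --seed 42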
1679
+
1680
+ def read_config_from_file(args: argparse.Namespace, parser: argparse.ArgumentParser):
1681
+ if not args.config_file:
1682
+ return args
1683
+
1684
+ config_path = args.config_file + ".toml" if not args.config_file.endswith(".toml") else args.config_file
1685
+
1686
+ if not os.path.exists(config_path):
1687
+ logger.info(f"{config_path} not found.")
1688
+ exit(1)
1689
+
1690
+ logger.info(f"Loading settings from {config_path}...")
1691
+ with open(config_path, "r", encoding="utf-8") as f:
1692
+ config_dict = toml.load(f)
1693
+
1694
+ # combine all sections into one
1695
+ ignore_nesting_dict = {}
1696
+ for section_name, section_dict in config_dict.items():
1697
+ # if value is not dict, save key and value as is
1698
+ if not isinstance(section_dict, dict):
1699
+ ignore_nesting_dict[section_name] = section_dict
1700
+ continue
1701
+
1702
+ # if value is dict, save all key and value into one dict
1703
+ for key, value in section_dict.items():
1704
+ ignore_nesting_dict[key] = value
1705
+
1706
+ config_args = argparse.Namespace(**ignore_nesting_dict)
1707
+ args = parser.parse_args(namespace=config_args)
1708
+ args.config_file = os.path.splitext(args.config_file)[0]
1709
+ logger.info(args.config_file)
1710
+
1711
+ return args
1712
+
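# Hedged sketch of the config flattening done by read_config_from_file above: nested .toml
# sections are merged into one flat namespace, so the section names are purely cosmetic
# (keys and values below are illustrative).
import toml

config_text = """
[model]
dit = "ckpts/dit.safetensors"

[training]
learning_rate = 2e-6
max_train_steps = 1600
"""
config_dict = toml.loads(config_text)

flat = {}
for section_name, section_dict in config_dict.items():
    if not isinstance(section_dict, dict):
        flat[section_name] = section_dict
        continue
    flat.update(section_dict)

print(flat)  # {'dit': 'ckpts/dit.safetensors', 'learning_rate': 2e-06, 'max_train_steps': 1600}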
1713
+
1714
+ if __name__ == "__main__":
1715
+ parser = setup_parser()
1716
+
1717
+ args = parser.parse_args()
1718
+ args = read_config_from_file(args, parser)
1719
+
1720
+ trainer = FineTuningTrainer()
1721
+ trainer.train(args)
hv_train_network.py ADDED
The diff for this file is too large to render. See raw diff
 
wan_cache_latents.py ADDED
@@ -0,0 +1,177 @@
1
+ import argparse
2
+ import os
3
+ import glob
4
+ from typing import Optional, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ from tqdm import tqdm
9
+
10
+ from dataset import config_utils
11
+ from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
12
+ from PIL import Image
13
+
14
+ import logging
15
+
16
+ from dataset.image_video_dataset import ItemInfo, save_latent_cache_wan, ARCHITECTURE_WAN
17
+ from utils.model_utils import str_to_dtype
18
+ from wan.configs import wan_i2v_14B
19
+ from wan.modules.vae import WanVAE
20
+ from wan.modules.clip import CLIPModel
21
+ import cache_latents
22
+
23
+ logger = logging.getLogger(__name__)
24
+ logging.basicConfig(level=logging.INFO)
25
+
26
+
27
+ def encode_and_save_batch(vae: WanVAE, clip: Optional[CLIPModel], batch: list[ItemInfo]):
28
+ contents = torch.stack([torch.from_numpy(item.content) for item in batch])
29
+ if len(contents.shape) == 4:
30
+ contents = contents.unsqueeze(1) # B, H, W, C -> B, F, H, W, C
31
+
32
+ contents = contents.permute(0, 4, 1, 2, 3).contiguous() # B, C, F, H, W
33
+ contents = contents.to(vae.device, dtype=vae.dtype)
34
+ contents = contents / 127.5 - 1.0 # normalize to [-1, 1]
35
+
36
+ h, w = contents.shape[3], contents.shape[4]
37
+ if h < 8 or w < 8:
38
+ item = batch[0] # other items should have the same size
39
+ raise ValueError(f"Image or video size too small: {item.item_key} and {len(batch) - 1} more, size: {item.original_size}")
40
+
41
+ # print(f"encode batch: {contents.shape}")
42
+ with torch.amp.autocast(device_type=vae.device.type, dtype=vae.dtype), torch.no_grad():
43
+ latent = vae.encode(contents) # list of Tensor[C, F, H, W]
44
+ latent = torch.stack(latent, dim=0) # B, C, F, H, W
45
+ latent = latent.to(vae.dtype) # convert to bfloat16, we are not sure if this is correct
46
+
47
+ if clip is not None:
48
+ # extract first frame of contents
49
+ images = contents[:, :, 0:1, :, :] # B, C, F, H, W, non contiguous view is fine
50
+
51
+ with torch.amp.autocast(device_type=clip.device.type, dtype=torch.float16), torch.no_grad():
52
+ clip_context = clip.visual(images)
53
+ clip_context = clip_context.to(torch.float16) # convert to fp16
54
+
55
+ # encode image latent for I2V
56
+ B, _, _, lat_h, lat_w = latent.shape
57
+ F = contents.shape[2]
58
+
59
+ # Create mask for the required number of frames
60
+ msk = torch.ones(1, F, lat_h, lat_w, dtype=vae.dtype, device=vae.device)
61
+ msk[:, 1:] = 0
62
+ msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1)
63
+ msk = msk.view(1, msk.shape[1] // 4, 4, lat_h, lat_w)
64
+ msk = msk.transpose(1, 2) # 1, F, 4, H, W -> 1, 4, F, H, W
65
+ msk = msk.repeat(B, 1, 1, 1, 1) # B, 4, F, H, W
66
+
67
+ # Zero padding for the required number of frames only
68
+ padding_frames = F - 1 # The first frame is the input image
69
+ images_resized = torch.concat([images, torch.zeros(B, 3, padding_frames, h, w, device=vae.device)], dim=2)
70
+ with torch.amp.autocast(device_type=vae.device.type, dtype=vae.dtype), torch.no_grad():
71
+ y = vae.encode(images_resized)
72
+ y = torch.stack(y, dim=0) # B, C, F, H, W
73
+
74
+ y = y[:, :, :F] # may not be needed
75
+ y = y.to(vae.dtype) # convert to bfloat16
76
+ y = torch.concat([msk, y], dim=1) # B, 4 + C, F, H, W
77
+
78
+ else:
79
+ clip_context = None
80
+ y = None
81
+
82
+ # control videos
83
+ if batch[0].control_content is not None:
84
+ control_contents = torch.stack([torch.from_numpy(item.control_content) for item in batch])
85
+ if len(control_contents.shape) == 4:
86
+ control_contents = control_contents.unsqueeze(1)
87
+ control_contents = control_contents.permute(0, 4, 1, 2, 3).contiguous() # B, C, F, H, W
88
+ control_contents = control_contents.to(vae.device, dtype=vae.dtype)
89
+ control_contents = control_contents / 127.5 - 1.0 # normalize to [-1, 1]
90
+ with torch.amp.autocast(device_type=vae.device.type, dtype=vae.dtype), torch.no_grad():
91
+ control_latent = vae.encode(control_contents) # list of Tensor[C, F, H, W]
92
+ control_latent = torch.stack(control_latent, dim=0) # B, C, F, H, W
93
+ control_latent = control_latent.to(vae.dtype) # convert to bfloat16
94
+ else:
95
+ control_latent = None
96
+
97
+ # # debug: decode and save
98
+ # with torch.no_grad():
99
+ # latent_to_decode = latent / vae.config.scaling_factor
100
+ # images = vae.decode(latent_to_decode, return_dict=False)[0]
101
+ # images = (images / 2 + 0.5).clamp(0, 1)
102
+ # images = images.cpu().float().numpy()
103
+ # images = (images * 255).astype(np.uint8)
104
+ # images = images.transpose(0, 2, 3, 4, 1) # B, C, F, H, W -> B, F, H, W, C
105
+ # for b in range(images.shape[0]):
106
+ # for f in range(images.shape[1]):
107
+ # fln = os.path.splitext(os.path.basename(batch[b].item_key))[0]
108
+ # img = Image.fromarray(images[b, f])
109
+ # img.save(f"./logs/decode_{fln}_{b}_{f:03d}.jpg")
110
+
111
+ for i, item in enumerate(batch):
112
+ l = latent[i]
113
+ cctx = clip_context[i] if clip is not None else None
114
+ y_i = y[i] if clip is not None else None
115
+ control_latent_i = control_latent[i] if control_latent is not None else None
116
+ # print(f"save latent cache: {item.latent_cache_path}, latent shape: {l.shape}")
117
+ save_latent_cache_wan(item, l, cctx, y_i, control_latent_i)
118
+
119
+
120
+ def main(args):
121
+ device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
122
+ device = torch.device(device)
123
+
124
+ # Load dataset config
125
+ blueprint_generator = BlueprintGenerator(ConfigSanitizer())
126
+ logger.info(f"Load dataset config from {args.dataset_config}")
127
+ user_config = config_utils.load_user_config(args.dataset_config)
128
+ blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_WAN)
129
+ train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)
130
+
131
+ datasets = train_dataset_group.datasets
132
+
133
+ if args.debug_mode is not None:
134
+ cache_latents.show_datasets(
135
+ datasets, args.debug_mode, args.console_width, args.console_back, args.console_num_images, fps=16
136
+ )
137
+ return
138
+
139
+ assert args.vae is not None, "vae checkpoint is required"
140
+
141
+ vae_path = args.vae
142
+
143
+ logger.info(f"Loading VAE model from {vae_path}")
144
+ vae_dtype = torch.bfloat16 if args.vae_dtype is None else str_to_dtype(args.vae_dtype)
145
+ cache_device = torch.device("cpu") if args.vae_cache_cpu else None
146
+ vae = WanVAE(vae_path=vae_path, device=device, dtype=vae_dtype, cache_device=cache_device)
147
+
148
+ if args.clip is not None:
149
+ clip_dtype = wan_i2v_14B.i2v_14B["clip_dtype"]
150
+ clip = CLIPModel(dtype=clip_dtype, device=device, weight_path=args.clip)
151
+ else:
152
+ clip = None
153
+
154
+ # Encode images
155
+ def encode(one_batch: list[ItemInfo]):
156
+ encode_and_save_batch(vae, clip, one_batch)
157
+
158
+ cache_latents.encode_datasets(datasets, encode, args)
159
+
160
+
161
+ def wan_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
162
+ parser.add_argument("--vae_cache_cpu", action="store_true", help="cache features in VAE on CPU")
163
+ parser.add_argument(
164
+ "--clip",
165
+ type=str,
166
+ default=None,
167
+ help="text encoder (CLIP) checkpoint path, optional. If training I2V model, this is required",
168
+ )
169
+ return parser
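+ # Example invocation (script name and paths are placeholders):
+ #   python wan_cache_latents.py --dataset_config dataset.toml --vae wan_2.1_vae.pth --clip clip_model.pth
+ # --clip is only required when caching for I2V training.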
170
+
171
+
172
+ if __name__ == "__main__":
173
+ parser = cache_latents.setup_parser_common()
174
+ parser = wan_setup_parser(parser)
175
+
176
+ args = parser.parse_args()
177
+ main(args)
wan_cache_text_encoder_outputs.py ADDED
@@ -0,0 +1,107 @@
1
+ import argparse
2
+ import os
3
+ from typing import Optional, Union
4
+
5
+ import numpy as np
6
+ import torch
7
+ from tqdm import tqdm
8
+
9
+ from dataset import config_utils
10
+ from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
11
+ import accelerate
12
+
13
+ from dataset.image_video_dataset import ARCHITECTURE_WAN, ItemInfo, save_text_encoder_output_cache_wan
14
+
15
+ # for t5 config: all Wan2.1 models have the same config for t5
16
+ from wan.configs import wan_t2v_14B
17
+
18
+ import cache_text_encoder_outputs
19
+ import logging
20
+
21
+ from utils.model_utils import str_to_dtype
22
+ from wan.modules.t5 import T5EncoderModel
23
+
24
+ logger = logging.getLogger(__name__)
25
+ logging.basicConfig(level=logging.INFO)
26
+
27
+
28
+ def encode_and_save_batch(
29
+ text_encoder: T5EncoderModel, batch: list[ItemInfo], device: torch.device, accelerator: Optional[accelerate.Accelerator]
30
+ ):
31
+ prompts = [item.caption for item in batch]
32
+ # print(prompts)
33
+
34
+ # encode prompt
35
+ with torch.no_grad():
36
+ if accelerator is not None:
37
+ with accelerator.autocast():
38
+ context = text_encoder(prompts, device)
39
+ else:
40
+ context = text_encoder(prompts, device)
41
+
42
+ # save prompt cache
43
+ for item, ctx in zip(batch, context):
44
+ save_text_encoder_output_cache_wan(item, ctx)
45
+
46
+
47
+ def main(args):
48
+ device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
49
+ device = torch.device(device)
50
+
51
+ # Load dataset config
52
+ blueprint_generator = BlueprintGenerator(ConfigSanitizer())
53
+ logger.info(f"Load dataset config from {args.dataset_config}")
54
+ user_config = config_utils.load_user_config(args.dataset_config)
55
+ blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_WAN)
56
+ train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)
57
+
58
+ datasets = train_dataset_group.datasets
59
+
60
+ # define accelerator for fp8 inference
61
+ config = wan_t2v_14B.t2v_14B # all Wan2.1 models have the same config for t5
62
+ accelerator = None
63
+ if args.fp8_t5:
64
+ accelerator = accelerate.Accelerator(mixed_precision="bf16" if config.t5_dtype == torch.bfloat16 else "fp16")
65
+
66
+ # prepare cache files and paths: all_cache_files_for_dataset = existing cache files, all_cache_paths_for_dataset = all cache paths in the dataset
67
+ all_cache_files_for_dataset, all_cache_paths_for_dataset = cache_text_encoder_outputs.prepare_cache_files_and_paths(datasets)
68
+
69
+ # Load T5
70
+ logger.info(f"Loading T5: {args.t5}")
71
+ text_encoder = T5EncoderModel(
72
+ text_len=config.text_len, dtype=config.t5_dtype, device=device, weight_path=args.t5, fp8=args.fp8_t5
73
+ )
74
+
75
+ # Encode with T5
76
+ logger.info("Encoding with T5")
77
+
78
+ def encode_for_text_encoder(batch: list[ItemInfo]):
79
+ encode_and_save_batch(text_encoder, batch, device, accelerator)
80
+
81
+ cache_text_encoder_outputs.process_text_encoder_batches(
82
+ args.num_workers,
83
+ args.skip_existing,
84
+ args.batch_size,
85
+ datasets,
86
+ all_cache_files_for_dataset,
87
+ all_cache_paths_for_dataset,
88
+ encode_for_text_encoder,
89
+ )
90
+ del text_encoder
91
+
92
+ # remove cache files not in dataset
93
+ cache_text_encoder_outputs.post_process_cache_files(datasets, all_cache_files_for_dataset, all_cache_paths_for_dataset, args.keep_cache)
94
+
95
+
96
+ def wan_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
97
+ parser.add_argument("--t5", type=str, default=None, required=True, help="text encoder (T5) checkpoint path")
98
+ parser.add_argument("--fp8_t5", action="store_true", help="use fp8 for Text Encoder model")
99
+ return parser
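+ # Example invocation (paths are placeholders):
+ #   python wan_cache_text_encoder_outputs.py --dataset_config dataset.toml --t5 t5_checkpoint.pth --batch_size 16
+ # Add --fp8_t5 to run the T5 encoder in fp8.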
100
+
101
+
102
+ if __name__ == "__main__":
103
+ parser = cache_text_encoder_outputs.setup_parser_common()
104
+ parser = wan_setup_parser(parser)
105
+
106
+ args = parser.parse_args()
107
+ main(args)
wan_generate_video.py ADDED
@@ -0,0 +1,1902 @@
1
+ import argparse
2
+ from datetime import datetime
3
+ import gc
4
+ import random
5
+ import os
6
+ import re
7
+ import time
8
+ import math
9
+ import copy
10
+ from types import ModuleType, SimpleNamespace
11
+ from typing import Tuple, Optional, List, Union, Any, Dict
12
+
13
+ import torch
14
+ import accelerate
15
+ from accelerate import Accelerator
16
+ from safetensors.torch import load_file, save_file
17
+ from safetensors import safe_open
18
+ from PIL import Image
19
+ import cv2
20
+ import numpy as np
21
+ import torchvision.transforms.functional as TF
22
+ from tqdm import tqdm
23
+
24
+ from networks import lora_wan
25
+ from utils.safetensors_utils import mem_eff_save_file, load_safetensors
26
+ from wan.configs import WAN_CONFIGS, SUPPORTED_SIZES
27
+ import wan
28
+ from wan.modules.model import WanModel, load_wan_model, detect_wan_sd_dtype
29
+ from wan.modules.vae import WanVAE
30
+ from wan.modules.t5 import T5EncoderModel
31
+ from wan.modules.clip import CLIPModel
32
+ from modules.scheduling_flow_match_discrete import FlowMatchDiscreteScheduler
33
+ from wan.utils.fm_solvers import FlowDPMSolverMultistepScheduler, get_sampling_sigmas, retrieve_timesteps
34
+ from wan.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
35
+
36
+ try:
37
+ from lycoris.kohya import create_network_from_weights
38
+ except ImportError:
39
+ pass
40
+
41
+ from utils.model_utils import str_to_dtype
42
+ from utils.device_utils import clean_memory_on_device
43
+ from hv_generate_video import save_images_grid, save_videos_grid, synchronize_device
44
+ from dataset.image_video_dataset import load_video
45
+
46
+ import logging
47
+
48
+ logger = logging.getLogger(__name__)
49
+ logging.basicConfig(level=logging.INFO)
50
+
51
+
52
+ class GenerationSettings:
53
+ def __init__(
54
+ self, device: torch.device, cfg, dit_dtype: torch.dtype, dit_weight_dtype: Optional[torch.dtype], vae_dtype: torch.dtype
55
+ ):
56
+ self.device = device
57
+ self.cfg = cfg
58
+ self.dit_dtype = dit_dtype
59
+ self.dit_weight_dtype = dit_weight_dtype
60
+ self.vae_dtype = vae_dtype
61
+
62
+
63
+ def parse_args() -> argparse.Namespace:
64
+ """parse command line arguments"""
65
+ parser = argparse.ArgumentParser(description="Wan 2.1 inference script")
66
+
67
+ # WAN arguments
68
+ parser.add_argument("--ckpt_dir", type=str, default=None, help="The path to the checkpoint directory (Wan 2.1 official).")
69
+ parser.add_argument("--task", type=str, default="t2v-14B", choices=list(WAN_CONFIGS.keys()), help="The task to run.")
70
+ parser.add_argument(
71
+ "--sample_solver", type=str, default="unipc", choices=["unipc", "dpm++", "vanilla"], help="The solver used to sample."
72
+ )
73
+
74
+ parser.add_argument("--dit", type=str, default=None, help="DiT checkpoint path")
75
+ parser.add_argument("--vae", type=str, default=None, help="VAE checkpoint path")
76
+ parser.add_argument("--vae_dtype", type=str, default=None, help="data type for VAE, default is bfloat16")
77
+ parser.add_argument("--vae_cache_cpu", action="store_true", help="cache features in VAE on CPU")
78
+ parser.add_argument("--t5", type=str, default=None, help="text encoder (T5) checkpoint path")
79
+ parser.add_argument("--clip", type=str, default=None, help="text encoder (CLIP) checkpoint path")
80
+ # LoRA
81
+ parser.add_argument("--lora_weight", type=str, nargs="*", required=False, default=None, help="LoRA weight path")
82
+ parser.add_argument("--lora_multiplier", type=float, nargs="*", default=1.0, help="LoRA multiplier")
83
+ parser.add_argument("--include_patterns", type=str, nargs="*", default=None, help="LoRA module include patterns")
84
+ parser.add_argument("--exclude_patterns", type=str, nargs="*", default=None, help="LoRA module exclude patterns")
85
+ parser.add_argument(
86
+ "--save_merged_model",
87
+ type=str,
88
+ default=None,
89
+ help="Save merged model to path. If specified, no inference will be performed.",
90
+ )
91
+
92
+ # inference
93
+ parser.add_argument("--prompt", type=str, default=None, help="prompt for generation")
94
+ parser.add_argument(
95
+ "--negative_prompt",
96
+ type=str,
97
+ default=None,
98
+ help="negative prompt for generation, use default negative prompt if not specified",
99
+ )
100
+ parser.add_argument("--video_size", type=int, nargs=2, default=[256, 256], help="video size, height and width")
101
+ parser.add_argument("--video_length", type=int, default=None, help="video length, Default depends on task")
102
+ parser.add_argument("--fps", type=int, default=16, help="video fps, Default is 16")
103
+ parser.add_argument("--infer_steps", type=int, default=None, help="number of inference steps")
104
+ parser.add_argument("--save_path", type=str, required=True, help="path to save generated video")
105
+ parser.add_argument("--seed", type=int, default=None, help="Seed for evaluation.")
106
+ parser.add_argument(
107
+ "--cpu_noise", action="store_true", help="Use CPU to generate noise (compatible with ComfyUI). Default is False."
108
+ )
109
+ parser.add_argument(
110
+ "--guidance_scale",
111
+ type=float,
112
+ default=5.0,
113
+ help="Guidance scale for classifier free guidance. Default is 5.0.",
114
+ )
115
+ parser.add_argument("--video_path", type=str, default=None, help="path to video for video2video inference")
116
+ parser.add_argument("--image_path", type=str, default=None, help="path to image for image2video inference")
117
+ parser.add_argument("--end_image_path", type=str, default=None, help="path to end image for image2video inference")
118
+ parser.add_argument(
119
+ "--control_path",
120
+ type=str,
121
+ default=None,
122
+ help="path to control video for inference with controlnet. video file or directory with images",
123
+ )
124
+ parser.add_argument("--trim_tail_frames", type=int, default=0, help="trim tail N frames from the video before saving")
125
+ parser.add_argument(
126
+ "--cfg_skip_mode",
127
+ type=str,
128
+ default="none",
129
+ choices=["early", "late", "middle", "early_late", "alternate", "none"],
130
+ help="CFG skip mode. each mode skips different parts of the CFG. "
131
+ " early: initial steps, late: later steps, middle: middle steps, early_late: both early and late, alternate: alternate, none: no skip (default)",
132
+ )
133
+ parser.add_argument(
134
+ "--cfg_apply_ratio",
135
+ type=float,
136
+ default=None,
137
+ help="The ratio of steps to apply CFG (0.0 to 1.0). Default is None (apply all steps).",
138
+ )
139
+ parser.add_argument(
140
+ "--slg_layers", type=str, default=None, help="Skip block (layer) indices for SLG (Skip Layer Guidance), comma separated"
141
+ )
142
+ parser.add_argument(
143
+ "--slg_scale",
144
+ type=float,
145
+ default=3.0,
146
+ help="scale for SLG classifier free guidance. Default is 3.0. Ignored if slg_mode is None or uncond",
147
+ )
148
+ parser.add_argument("--slg_start", type=float, default=0.0, help="start ratio for inference steps for SLG. Default is 0.0.")
149
+ parser.add_argument("--slg_end", type=float, default=0.3, help="end ratio for inference steps for SLG. Default is 0.3.")
150
+ parser.add_argument(
151
+ "--slg_mode",
152
+ type=str,
153
+ default=None,
154
+ choices=["original", "uncond"],
155
+ help="SLG mode. original: same as SD3, uncond: replace uncond pred with SLG pred",
156
+ )
157
+
158
+ # Flow Matching
159
+ parser.add_argument(
160
+ "--flow_shift",
161
+ type=float,
162
+ default=None,
163
+ help="Shift factor for flow matching schedulers. Default depends on task.",
164
+ )
165
+
166
+ parser.add_argument("--fp8", action="store_true", help="use fp8 for DiT model")
167
+ parser.add_argument("--fp8_scaled", action="store_true", help="use scaled fp8 for DiT, only for fp8")
168
+ parser.add_argument("--fp8_fast", action="store_true", help="Enable fast FP8 arithmetic (RTX 4XXX+), only for fp8_scaled")
169
+ parser.add_argument("--fp8_t5", action="store_true", help="use fp8 for Text Encoder model")
170
+ parser.add_argument(
171
+ "--device", type=str, default=None, help="device to use for inference. If None, use CUDA if available, otherwise use CPU"
172
+ )
173
+ parser.add_argument(
174
+ "--attn_mode",
175
+ type=str,
176
+ default="torch",
177
+ choices=["flash", "flash2", "flash3", "torch", "sageattn", "xformers", "sdpa"],
178
+ help="attention mode",
179
+ )
180
+ parser.add_argument("--blocks_to_swap", type=int, default=0, help="number of blocks to swap in the model")
181
+ parser.add_argument(
182
+ "--output_type", type=str, default="video", choices=["video", "images", "latent", "both"], help="output type"
183
+ )
184
+ parser.add_argument("--no_metadata", action="store_true", help="do not save metadata")
185
+ parser.add_argument("--latent_path", type=str, nargs="*", default=None, help="path to latent for decode. no inference")
186
+ parser.add_argument("--lycoris", action="store_true", help="use lycoris for inference")
187
+ parser.add_argument("--compile", action="store_true", help="Enable torch.compile")
188
+ parser.add_argument(
189
+ "--compile_args",
190
+ nargs=4,
191
+ metavar=("BACKEND", "MODE", "DYNAMIC", "FULLGRAPH"),
192
+ default=["inductor", "max-autotune-no-cudagraphs", "False", "False"],
193
+ help="Torch.compile settings",
194
+ )
195
+
196
+ # New arguments for batch and interactive modes
197
+ parser.add_argument("--from_file", type=str, default=None, help="Read prompts from a file")
198
+ parser.add_argument("--interactive", action="store_true", help="Interactive mode: read prompts from console")
199
+
200
+ args = parser.parse_args()
201
+
202
+ # Validate arguments
203
+ if args.from_file and args.interactive:
204
+ raise ValueError("Cannot use both --from_file and --interactive at the same time")
205
+
206
+ if args.prompt is None and not args.from_file and not args.interactive and args.latent_path is None:
207
+ raise ValueError("Either --prompt, --from_file, --interactive, or --latent_path must be specified")
208
+
209
+ assert (args.latent_path is None or len(args.latent_path) == 0) or (
210
+ args.output_type == "images" or args.output_type == "video"
211
+ ), "latent_path is only supported for images or video output"
212
+
213
+ return args
214
+
215
+
216
+ def parse_prompt_line(line: str) -> Dict[str, Any]:
217
+ """Parse a prompt line into a dictionary of argument overrides
218
+
219
+ Args:
220
+ line: Prompt line with options
221
+
222
+ Returns:
223
+ Dict[str, Any]: Dictionary of argument overrides
224
+ """
225
+ # TODO common function with hv_train_network.line_to_prompt_dict
226
+ parts = line.split(" --")
227
+ prompt = parts[0].strip()
228
+
229
+ # Create dictionary of overrides
230
+ overrides = {"prompt": prompt}
231
+
232
+ for part in parts[1:]:
233
+ if not part.strip():
234
+ continue
235
+ option_parts = part.split(" ", 1)
236
+ option = option_parts[0].strip()
237
+ value = option_parts[1].strip() if len(option_parts) > 1 else ""
238
+
239
+ # Map options to argument names
240
+ if option == "w":
241
+ overrides["video_size_width"] = int(value)
242
+ elif option == "h":
243
+ overrides["video_size_height"] = int(value)
244
+ elif option == "f":
245
+ overrides["video_length"] = int(value)
246
+ elif option == "d":
247
+ overrides["seed"] = int(value)
248
+ elif option == "s":
249
+ overrides["infer_steps"] = int(value)
250
+ elif option == "g" or option == "l":
251
+ overrides["guidance_scale"] = float(value)
252
+ elif option == "fs":
253
+ overrides["flow_shift"] = float(value)
254
+ elif option == "i":
255
+ overrides["image_path"] = value
256
+ elif option == "cn":
257
+ overrides["control_path"] = value
258
+ elif option == "n":
259
+ overrides["negative_prompt"] = value
260
+
261
+ return overrides
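+ # Example prompt line: "a cat walking on the beach --w 832 --h 480 --f 81 --d 42 --s 20 --g 5.0 --n blurry"
+ # overrides video size, length, seed, inference steps, guidance scale and negative prompt for that prompt.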
262
+
263
+
264
+ def apply_overrides(args: argparse.Namespace, overrides: Dict[str, Any]) -> argparse.Namespace:
265
+ """Apply overrides to args
266
+
267
+ Args:
268
+ args: Original arguments
269
+ overrides: Dictionary of overrides
270
+
271
+ Returns:
272
+ argparse.Namespace: New arguments with overrides applied
273
+ """
274
+ args_copy = copy.deepcopy(args)
275
+
276
+ for key, value in overrides.items():
277
+ if key == "video_size_width":
278
+ args_copy.video_size[1] = value
279
+ elif key == "video_size_height":
280
+ args_copy.video_size[0] = value
281
+ else:
282
+ setattr(args_copy, key, value)
283
+
284
+ return args_copy
285
+
286
+
287
+ def get_task_defaults(task: str, size: Optional[Tuple[int, int]] = None) -> Tuple[int, float, int, bool]:
288
+ """Return default values for each task
289
+
290
+ Args:
291
+ task: task name (t2v, t2i, i2v etc.)
292
+ size: size of the video (width, height)
293
+
294
+ Returns:
295
+ Tuple[int, float, int, bool]: (infer_steps, flow_shift, video_length, needs_clip)
296
+ """
297
+ width, height = size if size else (0, 0)
298
+
299
+ if "t2i" in task:
300
+ return 50, 5.0, 1, False
301
+ elif "i2v" in task:
302
+ flow_shift = 3.0 if (width == 832 and height == 480) or (width == 480 and height == 832) else 5.0
303
+ return 40, flow_shift, 81, True
304
+ else: # t2v or default
305
+ return 50, 5.0, 81, False
306
+
307
+
308
+ def setup_args(args: argparse.Namespace) -> argparse.Namespace:
309
+ """Validate and set default values for optional arguments
310
+
311
+ Args:
312
+ args: command line arguments
313
+
314
+ Returns:
315
+ argparse.Namespace: updated arguments
316
+ """
317
+ # Get default values for the task
318
+ infer_steps, flow_shift, video_length, _ = get_task_defaults(args.task, tuple(args.video_size))
319
+
320
+ # Apply default values to unset arguments
321
+ if args.infer_steps is None:
322
+ args.infer_steps = infer_steps
323
+ if args.flow_shift is None:
324
+ args.flow_shift = flow_shift
325
+ if args.video_length is None:
326
+ args.video_length = video_length
327
+
328
+ # Force video_length to 1 for t2i tasks
329
+ if "t2i" in args.task:
330
+ assert args.video_length == 1, f"video_length should be 1 for task {args.task}"
331
+
332
+ # parse slg_layers
333
+ if args.slg_layers is not None:
334
+ args.slg_layers = list(map(int, args.slg_layers.split(",")))
335
+
336
+ return args
337
+
338
+
339
+ def check_inputs(args: argparse.Namespace) -> Tuple[int, int, int]:
340
+ """Validate video size and length
341
+
342
+ Args:
343
+ args: command line arguments
344
+
345
+ Returns:
346
+ Tuple[int, int, int]: (height, width, video_length)
347
+ """
348
+ height = args.video_size[0]
349
+ width = args.video_size[1]
350
+ size = f"{width}*{height}"
351
+
352
+ if size not in SUPPORTED_SIZES[args.task]:
353
+ logger.warning(f"Size {size} is not supported for task {args.task}. Supported sizes are {SUPPORTED_SIZES[args.task]}.")
354
+
355
+ video_length = args.video_length
356
+
357
+ if height % 8 != 0 or width % 8 != 0:
358
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
359
+
360
+ return height, width, video_length
361
+
362
+
363
+ def calculate_dimensions(video_size: Tuple[int, int], video_length: int, config) -> Tuple[Tuple[int, int, int, int], int]:
364
+ """calculate dimensions for the generation
365
+
366
+ Args:
367
+ video_size: video frame size (height, width)
368
+ video_length: number of frames in the video
369
+ config: model configuration
370
+
371
+ Returns:
372
+ Tuple[Tuple[int, int, int, int], int]:
373
+ ((channels, frames, height, width), seq_len)
374
+ """
375
+ height, width = video_size
376
+ frames = video_length
377
+
378
+ # calculate latent space dimensions
379
+ lat_f = (frames - 1) // config.vae_stride[0] + 1
380
+ lat_h = height // config.vae_stride[1]
381
+ lat_w = width // config.vae_stride[2]
382
+
383
+ # calculate sequence length
384
+ seq_len = math.ceil((lat_h * lat_w) / (config.patch_size[1] * config.patch_size[2]) * lat_f)
385
+
386
+ return ((16, lat_f, lat_h, lat_w), seq_len)
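+ # Worked example, assuming the standard Wan2.1 settings vae_stride=(4, 8, 8) and patch_size=(1, 2, 2):
+ # a 480x832 (HxW), 81-frame video gives lat_f=21, lat_h=60, lat_w=104 and seq_len=ceil(60*104/4*21)=32760.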
387
+
388
+
389
+ def load_vae(args: argparse.Namespace, config, device: torch.device, dtype: torch.dtype) -> WanVAE:
390
+ """load VAE model
391
+
392
+ Args:
393
+ args: command line arguments
394
+ config: model configuration
395
+ device: device to use
396
+ dtype: data type for the model
397
+
398
+ Returns:
399
+ WanVAE: loaded VAE model
400
+ """
401
+ vae_path = args.vae if args.vae is not None else os.path.join(args.ckpt_dir, config.vae_checkpoint)
402
+
403
+ logger.info(f"Loading VAE model from {vae_path}")
404
+ cache_device = torch.device("cpu") if args.vae_cache_cpu else None
405
+ vae = WanVAE(vae_path=vae_path, device=device, dtype=dtype, cache_device=cache_device)
406
+ return vae
407
+
408
+
409
+ def load_text_encoder(args: argparse.Namespace, config, device: torch.device) -> T5EncoderModel:
410
+ """load text encoder (T5) model
411
+
412
+ Args:
413
+ args: command line arguments
414
+ config: model configuration
415
+ device: device to use
416
+
417
+ Returns:
418
+ T5EncoderModel: loaded text encoder model
419
+ """
420
+ checkpoint_path = None if args.ckpt_dir is None else os.path.join(args.ckpt_dir, config.t5_checkpoint)
421
+ tokenizer_path = None if args.ckpt_dir is None else os.path.join(args.ckpt_dir, config.t5_tokenizer)
422
+
423
+ text_encoder = T5EncoderModel(
424
+ text_len=config.text_len,
425
+ dtype=config.t5_dtype,
426
+ device=device,
427
+ checkpoint_path=checkpoint_path,
428
+ tokenizer_path=tokenizer_path,
429
+ weight_path=args.t5,
430
+ fp8=args.fp8_t5,
431
+ )
432
+
433
+ return text_encoder
434
+
435
+
436
+ def load_clip_model(args: argparse.Namespace, config, device: torch.device) -> CLIPModel:
437
+ """load CLIP model (for I2V only)
438
+
439
+ Args:
440
+ args: command line arguments
441
+ config: model configuration
442
+ device: device to use
443
+
444
+ Returns:
445
+ CLIPModel: loaded CLIP model
446
+ """
447
+ checkpoint_path = None if args.ckpt_dir is None else os.path.join(args.ckpt_dir, config.clip_checkpoint)
448
+ tokenizer_path = None if args.ckpt_dir is None else os.path.join(args.ckpt_dir, config.clip_tokenizer)
449
+
450
+ clip = CLIPModel(
451
+ dtype=config.clip_dtype,
452
+ device=device,
453
+ checkpoint_path=checkpoint_path,
454
+ tokenizer_path=tokenizer_path,
455
+ weight_path=args.clip,
456
+ )
457
+
458
+ return clip
459
+
460
+
461
+ def load_dit_model(
462
+ args: argparse.Namespace,
463
+ config,
464
+ device: torch.device,
465
+ dit_dtype: torch.dtype,
466
+ dit_weight_dtype: Optional[torch.dtype] = None,
467
+ is_i2v: bool = False,
468
+ ) -> WanModel:
469
+ """load DiT model
470
+
471
+ Args:
472
+ args: command line arguments
473
+ config: model configuration
474
+ device: device to use
475
+ dit_dtype: data type for the model
476
+ dit_weight_dtype: data type for the model weights. None for as-is
477
+ is_i2v: I2V mode
478
+
479
+ Returns:
480
+ WanModel: loaded DiT model
481
+ """
482
+ loading_device = "cpu"
483
+ if args.blocks_to_swap == 0 and args.lora_weight is None and not args.fp8_scaled:
484
+ loading_device = device
485
+
486
+ loading_weight_dtype = dit_weight_dtype
487
+ if args.fp8_scaled or args.lora_weight is not None:
488
+ loading_weight_dtype = dit_dtype # load as-is
489
+
490
+ # do not fp8 optimize because we will merge LoRA weights
491
+ model = load_wan_model(config, device, args.dit, args.attn_mode, False, loading_device, loading_weight_dtype, False)
492
+
493
+ return model
494
+
495
+
496
+ def merge_lora_weights(
497
+ lora_module: ModuleType,
498
+ model: torch.nn.Module,
499
+ args: argparse.Namespace,
500
+ device: torch.device,
501
+ converter: Optional[callable] = None,
502
+ ) -> None:
503
+ """merge LoRA weights to the model
504
+
505
+ Args:
506
+ lora_module: LoRA module, e.g. lora_wan
507
+ model: DiT model
508
+ args: command line arguments
509
+ device: device to use
510
+ converter: Optional callable to convert weights
511
+ """
512
+ if args.lora_weight is None or len(args.lora_weight) == 0:
513
+ return
514
+
515
+ for i, lora_weight in enumerate(args.lora_weight):
516
+ if args.lora_multiplier is not None and len(args.lora_multiplier) > i:
517
+ lora_multiplier = args.lora_multiplier[i]
518
+ else:
519
+ lora_multiplier = 1.0
520
+
521
+ logger.info(f"Loading LoRA weights from {lora_weight} with multiplier {lora_multiplier}")
522
+ weights_sd = load_file(lora_weight)
523
+ if converter is not None:
524
+ weights_sd = converter(weights_sd)
525
+
526
+ # apply include/exclude patterns
527
+ original_key_count = len(weights_sd.keys())
528
+ if args.include_patterns is not None and len(args.include_patterns) > i:
529
+ include_pattern = args.include_patterns[i]
530
+ regex_include = re.compile(include_pattern)
531
+ weights_sd = {k: v for k, v in weights_sd.items() if regex_include.search(k)}
532
+ logger.info(f"Filtered keys with include pattern {include_pattern}: {original_key_count} -> {len(weights_sd.keys())}")
533
+ if args.exclude_patterns is not None and len(args.exclude_patterns) > i:
534
+ original_key_count_ex = len(weights_sd.keys())
535
+ exclude_pattern = args.exclude_patterns[i]
536
+ regex_exclude = re.compile(exclude_pattern)
537
+ weights_sd = {k: v for k, v in weights_sd.items() if not regex_exclude.search(k)}
538
+ logger.info(
539
+ f"Filtered keys with exclude pattern {exclude_pattern}: {original_key_count_ex} -> {len(weights_sd.keys())}"
540
+ )
541
+ if len(weights_sd) != original_key_count:
542
+ remaining_keys = list(set([k.split(".", 1)[0] for k in weights_sd.keys()]))
543
+ remaining_keys.sort()
544
+ logger.info(f"Remaining LoRA modules after filtering: {remaining_keys}")
545
+ if len(weights_sd) == 0:
546
+ logger.warning(f"No keys left after filtering.")
547
+
548
+ if args.lycoris:
549
+ lycoris_net, _ = create_network_from_weights(
550
+ multiplier=lora_multiplier,
551
+ file=None,
552
+ weights_sd=weights_sd,
553
+ unet=model,
554
+ text_encoder=None,
555
+ vae=None,
556
+ for_inference=True,
557
+ )
558
+ lycoris_net.merge_to(None, model, weights_sd, dtype=None, device=device)
559
+ else:
560
+ network = lora_module.create_arch_network_from_weights(lora_multiplier, weights_sd, unet=model, for_inference=True)
561
+ network.merge_to(None, model, weights_sd, device=device, non_blocking=True)
562
+
563
+ synchronize_device(device)
564
+ logger.info("LoRA weights loaded")
565
+
566
+ # save model here before casting to dit_weight_dtype
567
+ if args.save_merged_model:
568
+ logger.info(f"Saving merged model to {args.save_merged_model}")
569
+ mem_eff_save_file(model.state_dict(), args.save_merged_model) # save_file needs a lot of memory
570
+ logger.info("Merged model saved")
571
+
572
+
573
+ def optimize_model(
574
+ model: WanModel, args: argparse.Namespace, device: torch.device, dit_dtype: torch.dtype, dit_weight_dtype: torch.dtype
575
+ ) -> None:
576
+ """optimize the model (FP8 conversion, device move etc.)
577
+
578
+ Args:
579
+ model: dit model
580
+ args: command line arguments
581
+ device: device to use
582
+ dit_dtype: dtype for the model
583
+ dit_weight_dtype: dtype for the model weights
584
+ """
585
+ if args.fp8_scaled:
586
+ # load state dict as-is and optimize to fp8
587
+ state_dict = model.state_dict()
588
+
589
+ # if no blocks to swap, we can move the weights to GPU after optimization on GPU (omit redundant CPU->GPU copy)
590
+ move_to_device = args.blocks_to_swap == 0 # if blocks_to_swap > 0, we will keep the model on CPU
591
+ state_dict = model.fp8_optimization(state_dict, device, move_to_device, use_scaled_mm=args.fp8_fast)
592
+
593
+ info = model.load_state_dict(state_dict, strict=True, assign=True)
594
+ logger.info(f"Loaded FP8 optimized weights: {info}")
595
+
596
+ if args.blocks_to_swap == 0:
597
+ model.to(device) # make sure all parameters are on the right device (e.g. RoPE etc.)
598
+ else:
599
+ # simple cast to dit_dtype
600
+ target_dtype = None # load as-is (dit_weight_dtype == dtype of the weights in state_dict)
601
+ target_device = None
602
+
603
+ if dit_weight_dtype is not None: # in case of args.fp8 and not args.fp8_scaled
604
+ logger.info(f"Convert model to {dit_weight_dtype}")
605
+ target_dtype = dit_weight_dtype
606
+
607
+ if args.blocks_to_swap == 0:
608
+ logger.info(f"Move model to device: {device}")
609
+ target_device = device
610
+
611
+ model.to(target_device, target_dtype) # move and cast at the same time. this reduces redundant copy operations
612
+
613
+ if args.compile:
614
+ compile_backend, compile_mode, compile_dynamic, compile_fullgraph = args.compile_args
615
+ logger.info(
616
+ f"Torch Compiling[Backend: {compile_backend}; Mode: {compile_mode}; Dynamic: {compile_dynamic}; Fullgraph: {compile_fullgraph}]"
617
+ )
618
+ torch._dynamo.config.cache_size_limit = 32
619
+ for i in range(len(model.blocks)):
620
+ model.blocks[i] = torch.compile(
621
+ model.blocks[i],
622
+ backend=compile_backend,
623
+ mode=compile_mode,
624
+ dynamic=compile_dynamic.lower() == "true",
625
+ fullgraph=compile_fullgraph.lower() == "true",
626
+ )
627
+
628
+ if args.blocks_to_swap > 0:
629
+ logger.info(f"Enable swap {args.blocks_to_swap} blocks to CPU from device: {device}")
630
+ model.enable_block_swap(args.blocks_to_swap, device, supports_backward=False)
631
+ model.move_to_device_except_swap_blocks(device)
632
+ model.prepare_block_swap_before_forward()
633
+ else:
634
+ # make sure the model is on the right device
635
+ model.to(device)
636
+
637
+ model.eval().requires_grad_(False)
638
+ clean_memory_on_device(device)
639
+
640
+
641
+ def prepare_t2v_inputs(
642
+ args: argparse.Namespace,
643
+ config,
644
+ accelerator: Accelerator,
645
+ device: torch.device,
646
+ vae: Optional[WanVAE] = None,
647
+ encoded_context: Optional[Dict] = None,
648
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Tuple[dict, dict]]:
649
+ """Prepare inputs for T2V
650
+
651
+ Args:
652
+ args: command line arguments
653
+ config: model configuration
654
+ accelerator: Accelerator instance
655
+ device: device to use
656
+ vae: VAE model for control video encoding
657
+ encoded_context: Pre-encoded text context
658
+
659
+ Returns:
660
+ Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Tuple[dict, dict]]:
661
+ (noise, context, context_null, (arg_c, arg_null))
662
+ """
663
+ # Prepare inputs for T2V
664
+ # calculate dimensions and sequence length
665
+ height, width = args.video_size
666
+ frames = args.video_length
667
+ (_, lat_f, lat_h, lat_w), seq_len = calculate_dimensions(args.video_size, args.video_length, config)
668
+ target_shape = (16, lat_f, lat_h, lat_w)
669
+
670
+ # configure negative prompt
671
+ n_prompt = args.negative_prompt if args.negative_prompt else config.sample_neg_prompt
672
+
673
+ # set seed
674
+ seed = args.seed if args.seed is not None else random.randint(0, 2**32 - 1)
675
+ if not args.cpu_noise:
676
+ seed_g = torch.Generator(device=device)
677
+ seed_g.manual_seed(seed)
678
+ else:
679
+ # ComfyUI compatible noise
680
+ seed_g = torch.manual_seed(seed)
681
+
682
+ if encoded_context is None:
683
+ # load text encoder
684
+ text_encoder = load_text_encoder(args, config, device)
685
+ text_encoder.model.to(device)
686
+
687
+ # encode prompt
688
+ with torch.no_grad():
689
+ if args.fp8_t5:
690
+ with torch.amp.autocast(device_type=device.type, dtype=config.t5_dtype):
691
+ context = text_encoder([args.prompt], device)
692
+ context_null = text_encoder([n_prompt], device)
693
+ else:
694
+ context = text_encoder([args.prompt], device)
695
+ context_null = text_encoder([n_prompt], device)
696
+
697
+ # free text encoder and clean memory
698
+ del text_encoder
699
+ clean_memory_on_device(device)
700
+ else:
701
+ # Use pre-encoded context
702
+ context = encoded_context["context"]
703
+ context_null = encoded_context["context_null"]
704
+
705
+ # Fun-Control: encode control video to latent space
706
+ if config.is_fun_control:
707
+ # TODO use same resizing as for image
708
+ logger.info(f"Encoding control video to latent space")
709
+ # C, F, H, W
710
+ control_video = load_control_video(args.control_path, frames, height, width).to(device)
711
+ vae.to_device(device)
712
+ with torch.autocast(device_type=device.type, dtype=vae.dtype), torch.no_grad():
713
+ control_latent = vae.encode([control_video])[0]
714
+ y = torch.concat([control_latent, torch.zeros_like(control_latent)], dim=0) # add control video latent
715
+ vae.to_device("cpu")
716
+ else:
717
+ y = None
718
+
719
+ # generate noise
720
+ noise = torch.randn(target_shape, dtype=torch.float32, generator=seed_g, device=device if not args.cpu_noise else "cpu")
721
+ noise = noise.to(device)
722
+
723
+ # prepare model input arguments
724
+ arg_c = {"context": context, "seq_len": seq_len}
725
+ arg_null = {"context": context_null, "seq_len": seq_len}
726
+ if y is not None:
727
+ arg_c["y"] = [y]
728
+ arg_null["y"] = [y]
729
+
730
+ return noise, context, context_null, (arg_c, arg_null)
731
+
732
+
733
+ def prepare_i2v_inputs(
734
+ args: argparse.Namespace,
735
+ config,
736
+ accelerator: Accelerator,
737
+ device: torch.device,
738
+ vae: WanVAE,
739
+ encoded_context: Optional[Dict] = None,
740
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[dict, dict]]:
741
+ """Prepare inputs for I2V
742
+
743
+ Args:
744
+ args: command line arguments
745
+ config: model configuration
746
+ accelerator: Accelerator instance
747
+ device: device to use
748
+ vae: VAE model, used for image encoding
749
+ encoded_context: Pre-encoded text context
750
+
751
+ Returns:
752
+ Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[dict, dict]]:
753
+ (noise, context, context_null, y, (arg_c, arg_null))
754
+ """
755
+ # get video dimensions
756
+ height, width = args.video_size
757
+ frames = args.video_length
758
+ max_area = width * height
759
+
760
+ # load image
761
+ img = Image.open(args.image_path).convert("RGB")
762
+
763
+ # convert to numpy
764
+ img_cv2 = np.array(img) # PIL to numpy
765
+
766
+ # convert to tensor (-1 to 1)
767
+ img_tensor = TF.to_tensor(img).sub_(0.5).div_(0.5).to(device)
768
+
769
+ # end frame image
770
+ if args.end_image_path is not None:
771
+ end_img = Image.open(args.end_image_path).convert("RGB")
772
+ end_img_cv2 = np.array(end_img) # PIL to numpy
773
+ else:
774
+ end_img = None
775
+ end_img_cv2 = None
776
+ has_end_image = end_img is not None
777
+
778
+ # calculate latent dimensions: keep aspect ratio
779
+ height, width = img_tensor.shape[1:]
780
+ aspect_ratio = height / width
781
+ lat_h = round(np.sqrt(max_area * aspect_ratio) // config.vae_stride[1] // config.patch_size[1] * config.patch_size[1])
782
+ lat_w = round(np.sqrt(max_area / aspect_ratio) // config.vae_stride[2] // config.patch_size[2] * config.patch_size[2])
783
+ height = lat_h * config.vae_stride[1]
784
+ width = lat_w * config.vae_stride[2]
785
+ lat_f = (frames - 1) // config.vae_stride[0] + 1 # size of latent frames
786
+ max_seq_len = (lat_f + (1 if has_end_image else 0)) * lat_h * lat_w // (config.patch_size[1] * config.patch_size[2])
787
+
788
+ # set seed
789
+ seed = args.seed if args.seed is not None else random.randint(0, 2**32 - 1)
790
+ if not args.cpu_noise:
791
+ seed_g = torch.Generator(device=device)
792
+ seed_g.manual_seed(seed)
793
+ else:
794
+ # ComfyUI compatible noise
795
+ seed_g = torch.manual_seed(seed)
796
+
797
+ # generate noise
798
+ noise = torch.randn(
799
+ 16,
800
+ lat_f + (1 if has_end_image else 0),
801
+ lat_h,
802
+ lat_w,
803
+ dtype=torch.float32,
804
+ generator=seed_g,
805
+ device=device if not args.cpu_noise else "cpu",
806
+ )
807
+ noise = noise.to(device)
808
+
809
+ # configure negative prompt
810
+ n_prompt = args.negative_prompt if args.negative_prompt else config.sample_neg_prompt
811
+
812
+ if encoded_context is None:
813
+ # load text encoder
814
+ text_encoder = load_text_encoder(args, config, device)
815
+ text_encoder.model.to(device)
816
+
817
+ # encode prompt
818
+ with torch.no_grad():
819
+ if args.fp8_t5:
820
+ with torch.amp.autocast(device_type=device.type, dtype=config.t5_dtype):
821
+ context = text_encoder([args.prompt], device)
822
+ context_null = text_encoder([n_prompt], device)
823
+ else:
824
+ context = text_encoder([args.prompt], device)
825
+ context_null = text_encoder([n_prompt], device)
826
+
827
+ # free text encoder and clean memory
828
+ del text_encoder
829
+ clean_memory_on_device(device)
830
+
831
+ # load CLIP model
832
+ clip = load_clip_model(args, config, device)
833
+ clip.model.to(device)
834
+
835
+ # encode image to CLIP context
836
+ logger.info(f"Encoding image to CLIP context")
837
+ with torch.amp.autocast(device_type=device.type, dtype=torch.float16), torch.no_grad():
838
+ clip_context = clip.visual([img_tensor[:, None, :, :]])
839
+ logger.info(f"Encoding complete")
840
+
841
+ # free CLIP model and clean memory
842
+ del clip
843
+ clean_memory_on_device(device)
844
+ else:
845
+ # Use pre-encoded context
846
+ context = encoded_context["context"]
847
+ context_null = encoded_context["context_null"]
848
+ clip_context = encoded_context["clip_context"]
849
+
850
+ # encode image to latent space with VAE
851
+ logger.info(f"Encoding image to latent space")
852
+ vae.to_device(device)
853
+
854
+ # resize image
855
+ interpolation = cv2.INTER_AREA if height < img_cv2.shape[0] else cv2.INTER_CUBIC
856
+ img_resized = cv2.resize(img_cv2, (width, height), interpolation=interpolation)
857
+ img_resized = TF.to_tensor(img_resized).sub_(0.5).div_(0.5).to(device) # -1 to 1, CHW
858
+ img_resized = img_resized.unsqueeze(1) # CFHW
859
+
860
+ if has_end_image:
861
+ interpolation = cv2.INTER_AREA if height < end_img_cv2.shape[1] else cv2.INTER_CUBIC
862
+ end_img_resized = cv2.resize(end_img_cv2, (width, height), interpolation=interpolation)
863
+ end_img_resized = TF.to_tensor(end_img_resized).sub_(0.5).div_(0.5).to(device) # -1 to 1, CHW
864
+ end_img_resized = end_img_resized.unsqueeze(1) # CFHW
865
+
866
+ # create mask for the first frame
867
+ msk = torch.zeros(4, lat_f + (1 if has_end_image else 0), lat_h, lat_w, device=device)
868
+ msk[:, 0] = 1
869
+ if has_end_image:
870
+ msk[:, -1] = 1
871
+
872
+ # encode image to latent space
873
+ with accelerator.autocast(), torch.no_grad():
874
+ # padding to match the required number of frames
875
+ padding_frames = frames - 1 # the first frame is image
876
+ img_resized = torch.concat([img_resized, torch.zeros(3, padding_frames, height, width, device=device)], dim=1)
877
+ y = vae.encode([img_resized])[0]
878
+
879
+ if has_end_image:
880
+ y_end = vae.encode([end_img_resized])[0]
881
+ y = torch.concat([y, y_end], dim=1) # add end frame
882
+
883
+ y = torch.concat([msk, y])
884
+ logger.info(f"Encoding complete")
885
+
886
+ # Fun-Control: encode control video to latent space
887
+ if config.is_fun_control:
888
+ # TODO use same resizing as for image
889
+ logger.info(f"Encoding control video to latent space")
890
+ # C, F, H, W
891
+ control_video = load_control_video(args.control_path, frames + (1 if has_end_image else 0), height, width).to(device)
892
+ with accelerator.autocast(), torch.no_grad():
893
+ control_latent = vae.encode([control_video])[0]
894
+ y = y[msk.shape[0] :] # remove mask because Fun-Control does not need it
895
+ if has_end_image:
896
+ y[:, 1:-1] = 0 # remove image latent except first and last frame. according to WanVideoWrapper, this doesn't work
897
+ else:
898
+ y[:, 1:] = 0 # remove image latent except first frame
899
+ y = torch.concat([control_latent, y], dim=0) # add control video latent
900
+
901
+ # prepare model input arguments
902
+ arg_c = {
903
+ "context": [context[0]],
904
+ "clip_fea": clip_context,
905
+ "seq_len": max_seq_len,
906
+ "y": [y],
907
+ }
908
+
909
+ arg_null = {
910
+ "context": context_null,
911
+ "clip_fea": clip_context,
912
+ "seq_len": max_seq_len,
913
+ "y": [y],
914
+ }
915
+
916
+ vae.to_device("cpu") # move VAE to CPU to save memory
917
+ clean_memory_on_device(device)
918
+
919
+ return noise, context, context_null, y, (arg_c, arg_null)
920
+
921
+
922
+ def load_control_video(control_path: str, frames: int, height: int, width: int) -> torch.Tensor:
923
+ """load control video to latent space
924
+
925
+ Args:
926
+ control_path: path to control video
927
+ frames: number of frames in the video
928
+ height: height of the video
929
+ width: width of the video
930
+
931
+ Returns:
932
+ torch.Tensor: control video latent, CFHW
933
+ """
934
+ logger.info(f"Load control video from {control_path}")
935
+ video = load_video(control_path, 0, frames, bucket_reso=(width, height)) # list of frames
936
+ if len(video) < frames:
937
+ raise ValueError(f"Video length is less than {frames}")
938
+ # video = np.stack(video, axis=0) # F, H, W, C
939
+ video = torch.stack([TF.to_tensor(frame).sub_(0.5).div_(0.5) for frame in video], dim=0) # F, C, H, W, -1 to 1
940
+ video = video.permute(1, 0, 2, 3) # C, F, H, W
941
+ return video
942
+
943
+
944
+ def setup_scheduler(args: argparse.Namespace, config, device: torch.device) -> Tuple[Any, torch.Tensor]:
945
+ """setup scheduler for sampling
946
+
947
+ Args:
948
+ args: command line arguments
949
+ config: model configuration
950
+ device: device to use
951
+
952
+ Returns:
953
+ Tuple[Any, torch.Tensor]: (scheduler, timesteps)
954
+ """
955
+ if args.sample_solver == "unipc":
956
+ scheduler = FlowUniPCMultistepScheduler(num_train_timesteps=config.num_train_timesteps, shift=1, use_dynamic_shifting=False)
957
+ scheduler.set_timesteps(args.infer_steps, device=device, shift=args.flow_shift)
958
+ timesteps = scheduler.timesteps
959
+ elif args.sample_solver == "dpm++":
960
+ scheduler = FlowDPMSolverMultistepScheduler(
961
+ num_train_timesteps=config.num_train_timesteps, shift=1, use_dynamic_shifting=False
962
+ )
963
+ sampling_sigmas = get_sampling_sigmas(args.infer_steps, args.flow_shift)
964
+ timesteps, _ = retrieve_timesteps(scheduler, device=device, sigmas=sampling_sigmas)
965
+ elif args.sample_solver == "vanilla":
966
+ scheduler = FlowMatchDiscreteScheduler(num_train_timesteps=config.num_train_timesteps, shift=args.flow_shift)
967
+ scheduler.set_timesteps(args.infer_steps, device=device)
968
+ timesteps = scheduler.timesteps
969
+
970
+ # FlowMatchDiscreteScheduler does not support generator argument in step method
971
+ org_step = scheduler.step
972
+
973
+ def step_wrapper(
974
+ model_output: torch.Tensor,
975
+ timestep: Union[int, torch.Tensor],
976
+ sample: torch.Tensor,
977
+ return_dict: bool = True,
978
+ generator=None,
979
+ ):
980
+ return org_step(model_output, timestep, sample, return_dict=return_dict)
981
+
982
+ scheduler.step = step_wrapper
983
+ else:
984
+ raise NotImplementedError("Unsupported solver.")
985
+
986
+ return scheduler, timesteps
987
+
988
+
989
+ def run_sampling(
990
+ model: WanModel,
991
+ noise: torch.Tensor,
992
+ scheduler: Any,
993
+ timesteps: torch.Tensor,
994
+ args: argparse.Namespace,
995
+ inputs: Tuple[dict, dict],
996
+ device: torch.device,
997
+ seed_g: torch.Generator,
998
+ accelerator: Accelerator,
999
+ is_i2v: bool = False,
1000
+ use_cpu_offload: bool = True,
1001
+ ) -> torch.Tensor:
1002
+ """run sampling
1003
+ Args:
1004
+ model: dit model
1005
+ noise: initial noise
1006
+ scheduler: scheduler for sampling
1007
+ timesteps: time steps for sampling
1008
+ args: command line arguments
1009
+ inputs: model input (arg_c, arg_null)
1010
+ device: device to use
1011
+ seed_g: random generator
1012
+ accelerator: Accelerator instance
1013
+ is_i2v: I2V mode (False means T2V mode)
1014
+ use_cpu_offload: Whether to offload tensors to CPU during processing
1015
+ Returns:
1016
+ torch.Tensor: generated latent
1017
+ """
1018
+ arg_c, arg_null = inputs
1019
+
1020
+ latent = noise
1021
+ latent_storage_device = device if not use_cpu_offload else "cpu"
1022
+ latent = latent.to(latent_storage_device)
1023
+
1024
+ # cfg skip
1025
+ apply_cfg_array = []
1026
+ num_timesteps = len(timesteps)
1027
+
1028
+ if args.cfg_skip_mode != "none" and args.cfg_apply_ratio is not None:
1029
+ # Calculate thresholds based on cfg_apply_ratio
1030
+ apply_steps = int(num_timesteps * args.cfg_apply_ratio)
1031
+
1032
+ if args.cfg_skip_mode == "early":
1033
+ # Skip CFG in early steps, apply in late steps
1034
+ start_index = num_timesteps - apply_steps
1035
+ end_index = num_timesteps
1036
+ elif args.cfg_skip_mode == "late":
1037
+ # Skip CFG in late steps, apply in early steps
1038
+ start_index = 0
1039
+ end_index = apply_steps
1040
+ elif args.cfg_skip_mode == "early_late":
1041
+ # Skip CFG in early and late steps, apply in middle steps
1042
+ start_index = (num_timesteps - apply_steps) // 2
1043
+ end_index = start_index + apply_steps
1044
+ elif args.cfg_skip_mode == "middle":
1045
+ # Skip CFG in middle steps, apply in early and late steps
1046
+ skip_steps = num_timesteps - apply_steps
1047
+ middle_start = (num_timesteps - skip_steps) // 2
1048
+ middle_end = middle_start + skip_steps
1049
+
1050
+ w = 0.0
1051
+ for step_idx in range(num_timesteps):
1052
+ if args.cfg_skip_mode == "alternate":
1053
+ # accumulate w and apply CFG when w >= 1.0
1054
+ w += args.cfg_apply_ratio
1055
+ apply = w >= 1.0
1056
+ if apply:
1057
+ w -= 1.0
1058
+ elif args.cfg_skip_mode == "middle":
1059
+ # Skip CFG in early and late steps, apply in middle steps
1060
+ apply = step_idx < middle_start or step_idx >= middle_end
1061
+ else:
1062
+ # Apply CFG on some steps based on ratio
1063
+ apply = step_idx >= start_index and step_idx < end_index
1064
+
1065
+ apply_cfg_array.append(apply)
1066
+
1067
+ pattern = ["A" if apply else "S" for apply in apply_cfg_array]
1068
+ pattern = "".join(pattern)
1069
+ logger.info(f"CFG skip mode: {args.cfg_skip_mode}, apply ratio: {args.cfg_apply_ratio}, pattern: {pattern}")
1070
+ else:
1071
+ # Apply CFG on all steps
1072
+ apply_cfg_array = [True] * num_timesteps
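+ # Example: with 10 steps, --cfg_apply_ratio 0.5 and --cfg_skip_mode late, the logged pattern is
+ # "AAAAASSSSS" (CFG applied on the first five steps, skipped on the last five).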
1073
+
1074
+ # SLG original implementation is based on https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py
1075
+ slg_start_step = int(args.slg_start * num_timesteps)
1076
+ slg_end_step = int(args.slg_end * num_timesteps)
1077
+
1078
+ for i, t in enumerate(tqdm(timesteps)):
1079
+ # latent is on CPU if use_cpu_offload is True
1080
+ latent_model_input = [latent.to(device)]
1081
+ timestep = torch.stack([t]).to(device)
1082
+
1083
+ with accelerator.autocast(), torch.no_grad():
1084
+ noise_pred_cond = model(latent_model_input, t=timestep, **arg_c)[0].to(latent_storage_device)
1085
+
1086
+ apply_cfg = apply_cfg_array[i] # apply CFG or not
1087
+ if apply_cfg:
1088
+ apply_slg = i >= slg_start_step and i < slg_end_step
1089
+ # print(f"Applying SLG: {apply_slg}, i: {i}, slg_start_step: {slg_start_step}, slg_end_step: {slg_end_step}")
1090
+ if args.slg_mode == "original" and apply_slg:
1091
+ noise_pred_uncond = model(latent_model_input, t=timestep, **arg_null)[0].to(latent_storage_device)
1092
+
1093
+ # apply guidance
1094
+ # SD3 formula: scaled = neg_out + (pos_out - neg_out) * cond_scale
1095
+ noise_pred = noise_pred_uncond + args.guidance_scale * (noise_pred_cond - noise_pred_uncond)
1096
+
1097
+ # calculate skip layer out
1098
+ skip_layer_out = model(latent_model_input, t=timestep, skip_block_indices=args.slg_layers, **arg_null)[0].to(
1099
+ latent_storage_device
1100
+ )
1101
+
1102
+ # apply skip layer guidance
1103
+ # SD3 formula: scaled = scaled + (pos_out - skip_layer_out) * self.slg
1104
+ noise_pred = noise_pred + args.slg_scale * (noise_pred_cond - skip_layer_out)
1105
+ elif args.slg_mode == "uncond" and apply_slg:
1106
+ # noise_pred_uncond is skip layer out
1107
+ noise_pred_uncond = model(latent_model_input, t=timestep, skip_block_indices=args.slg_layers, **arg_null)[0].to(
1108
+ latent_storage_device
1109
+ )
1110
+
1111
+ # apply guidance
1112
+ noise_pred = noise_pred_uncond + args.guidance_scale * (noise_pred_cond - noise_pred_uncond)
1113
+
1114
+ else:
1115
+ # normal guidance
1116
+ noise_pred_uncond = model(latent_model_input, t=timestep, **arg_null)[0].to(latent_storage_device)
1117
+
1118
+ # apply guidance
1119
+ noise_pred = noise_pred_uncond + args.guidance_scale * (noise_pred_cond - noise_pred_uncond)
1120
+ else:
1121
+ noise_pred = noise_pred_cond
1122
+
1123
+ # step
1124
+ latent_input = latent.unsqueeze(0)
1125
+ temp_x0 = scheduler.step(noise_pred.unsqueeze(0), t, latent_input, return_dict=False, generator=seed_g)[0]
1126
+
1127
+ # update latent
1128
+ latent = temp_x0.squeeze(0)
1129
+
1130
+ return latent
1131
+
1132
+
1133
+ def generate(args: argparse.Namespace, gen_settings: GenerationSettings, shared_models: Optional[Dict] = None) -> torch.Tensor:
1134
+ """main function for generation
1135
+
1136
+ Args:
1137
+ args: command line arguments
1138
+ shared_models: dictionary containing pre-loaded models and encoded data
1139
+
1140
+ Returns:
1141
+ torch.Tensor: generated latent
1142
+ """
1143
+ device, cfg, dit_dtype, dit_weight_dtype, vae_dtype = (
1144
+ gen_settings.device,
1145
+ gen_settings.cfg,
1146
+ gen_settings.dit_dtype,
1147
+ gen_settings.dit_weight_dtype,
1148
+ gen_settings.vae_dtype,
1149
+ )
1150
+
1151
+ # prepare accelerator
1152
+ mixed_precision = "bf16" if dit_dtype == torch.bfloat16 else "fp16"
1153
+ accelerator = accelerate.Accelerator(mixed_precision=mixed_precision)
1154
+
1155
+ # I2V or T2V
1156
+ is_i2v = "i2v" in args.task
1157
+
1158
+ # prepare seed
1159
+ seed = args.seed if args.seed is not None else random.randint(0, 2**32 - 1)
1160
+ args.seed = seed # set seed to args for saving
1161
+
1162
+ # Check if we have shared models
1163
+ if shared_models is not None:
1164
+ # Use shared models and encoded data
1165
+ vae = shared_models.get("vae")
1166
+ model = shared_models.get("model")
1167
+ encoded_context = shared_models.get("encoded_contexts", {}).get(args.prompt)
1168
+
1169
+ # prepare inputs
1170
+ if is_i2v:
1171
+ # I2V
1172
+ noise, context, context_null, y, inputs = prepare_i2v_inputs(args, cfg, accelerator, device, vae, encoded_context)
1173
+ else:
1174
+ # T2V
1175
+ noise, context, context_null, inputs = prepare_t2v_inputs(args, cfg, accelerator, device, vae, encoded_context)
1176
+ else:
1177
+ # prepare inputs without shared models
1178
+ if is_i2v:
1179
+ # I2V: need text encoder, VAE and CLIP
1180
+ vae = load_vae(args, cfg, device, vae_dtype)
1181
+ noise, context, context_null, y, inputs = prepare_i2v_inputs(args, cfg, accelerator, device, vae)
1182
+ # vae is on CPU after prepare_i2v_inputs
1183
+ else:
1184
+ # T2V: need text encoder
1185
+ vae = None
1186
+ if cfg.is_fun_control:
1187
+ # Fun-Control: need VAE for encoding control video
1188
+ vae = load_vae(args, cfg, device, vae_dtype)
1189
+ noise, context, context_null, inputs = prepare_t2v_inputs(args, cfg, accelerator, device, vae)
1190
+
1191
+ # load DiT model
1192
+ model = load_dit_model(args, cfg, device, dit_dtype, dit_weight_dtype, is_i2v)
1193
+
1194
+ # merge LoRA weights
1195
+ if args.lora_weight is not None and len(args.lora_weight) > 0:
1196
+ merge_lora_weights(lora_wan, model, args, device)
1197
+
1198
+ # if we only want to save the model, we can skip the rest
1199
+ if args.save_merged_model:
1200
+ return None
1201
+
1202
+ # optimize model: fp8 conversion, block swap etc.
1203
+ optimize_model(model, args, device, dit_dtype, dit_weight_dtype)
1204
+
1205
+ # setup scheduler
1206
+ scheduler, timesteps = setup_scheduler(args, cfg, device)
1207
+
1208
+ # set random generator
1209
+ seed_g = torch.Generator(device=device)
1210
+ seed_g.manual_seed(seed)
1211
+
1212
+ # run sampling
1213
+ latent = run_sampling(model, noise, scheduler, timesteps, args, inputs, device, seed_g, accelerator, is_i2v)
1214
+
1215
+ # Only clean up the models if they were created within this function (i.e. not shared)
1216
+ if shared_models is None:
1217
+ # free memory
1218
+ del model
1219
+ del scheduler
1220
+ synchronize_device(device)
1221
+
1222
+ # wait 5 seconds for block swap to finish
1223
+ logger.info("Waiting for 5 seconds to finish block swap")
1224
+ time.sleep(5)
1225
+
1226
+ gc.collect()
1227
+ clean_memory_on_device(device)
1228
+
1229
+ # save VAE model for decoding
1230
+ if vae is None:
1231
+ args._vae = None
1232
+ else:
1233
+ args._vae = vae
1234
+
1235
+ return latent
1236
+
1237
+
1238
+ def decode_latent(latent: torch.Tensor, args: argparse.Namespace, cfg) -> torch.Tensor:
1239
+ """decode latent
1240
+
1241
+ Args:
1242
+ latent: latent tensor
1243
+ args: command line arguments
1244
+ cfg: model configuration
1245
+
1246
+ Returns:
1247
+ torch.Tensor: decoded video or image
1248
+ """
1249
+ device = torch.device(args.device)
1250
+
1251
+ # load VAE model or use the one from the generation
1252
+ vae_dtype = str_to_dtype(args.vae_dtype) if args.vae_dtype is not None else torch.bfloat16
1253
+ if hasattr(args, "_vae") and args._vae is not None:
1254
+ vae = args._vae
1255
+ else:
1256
+ vae = load_vae(args, cfg, device, vae_dtype)
1257
+
1258
+ vae.to_device(device)
1259
+
1260
+ logger.info(f"Decoding video from latents: {latent.shape}")
1261
+ x0 = latent.to(device)
1262
+
1263
+ with torch.autocast(device_type=device.type, dtype=vae_dtype), torch.no_grad():
1264
+ videos = vae.decode(x0)
1265
+
1266
+ # some tail frames may be corrupted when an end frame is used, so an option is provided to trim them
1267
+ if args.trim_tail_frames:
1268
+ videos[0] = videos[0][:, : -args.trim_tail_frames]
1269
+
1270
+ logger.info(f"Decoding complete")
1271
+ video = videos[0]
1272
+ del videos
1273
+ video = video.to(torch.float32).cpu()
1274
+
1275
+ return video
1276
+
1277
+
1278
+ def save_latent(latent: torch.Tensor, args: argparse.Namespace, height: int, width: int) -> str:
1279
+ """Save latent to file
1280
+
1281
+ Args:
1282
+ latent: latent tensor
1283
+ args: command line arguments
1284
+ height: height of frame
1285
+ width: width of frame
1286
+
1287
+ Returns:
1288
+ str: Path to saved latent file
1289
+ """
1290
+ save_path = args.save_path
1291
+ os.makedirs(save_path, exist_ok=True)
1292
+ time_flag = datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S")
1293
+
1294
+ seed = args.seed
1295
+ video_length = args.video_length
1296
+ latent_path = f"{save_path}/{time_flag}_{seed}_latent.safetensors"
1297
+
1298
+ if args.no_metadata:
1299
+ metadata = None
1300
+ else:
1301
+ metadata = {
1302
+ "seeds": f"{seed}",
1303
+ "prompt": f"{args.prompt}",
1304
+ "height": f"{height}",
1305
+ "width": f"{width}",
1306
+ "video_length": f"{video_length}",
1307
+ "infer_steps": f"{args.infer_steps}",
1308
+ "guidance_scale": f"{args.guidance_scale}",
1309
+ }
1310
+ if args.negative_prompt is not None:
1311
+ metadata["negative_prompt"] = f"{args.negative_prompt}"
1312
+
1313
+ sd = {"latent": latent}
1314
+ save_file(sd, latent_path, metadata=metadata)
1315
+ logger.info(f"Latent saved to: {latent_path}")
1316
+
1317
+ return latent_path
1318
+
1319
+
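A latent saved this way can be loaded back together with its metadata, which is exactly what the latent-decode path in main() below relies on. A minimal sketch (the path is illustrative):

from safetensors import safe_open
from safetensors.torch import load_file

latent_path = "outputs/20250101-000000_42_latent.safetensors"  # illustrative path
latent = load_file(latent_path)["latent"]
with safe_open(latent_path, framework="pt") as f:
    metadata = f.metadata() or {}  # e.g. {"seeds": "42", "height": ..., "width": ..., "video_length": ...}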
1320
+ def save_video(video: torch.Tensor, args: argparse.Namespace, original_base_name: Optional[str] = None) -> str:
1321
+ """Save video to file
1322
+
1323
+ Args:
1324
+ video: Video tensor
1325
+ args: command line arguments
1326
+ original_base_name: Original base name (if latents are loaded from files)
1327
+
1328
+ Returns:
1329
+ str: Path to saved video file
1330
+ """
1331
+ save_path = args.save_path
1332
+ os.makedirs(save_path, exist_ok=True)
1333
+ time_flag = datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S")
1334
+
1335
+ seed = args.seed
1336
+ original_name = "" if original_base_name is None else f"_{original_base_name}"
1337
+ video_path = f"{save_path}/{time_flag}_{seed}{original_name}.mp4"
1338
+
1339
+ video = video.unsqueeze(0)
1340
+ save_videos_grid(video, video_path, fps=args.fps, rescale=True)
1341
+ logger.info(f"Video saved to: {video_path}")
1342
+
1343
+ return video_path
1344
+
1345
+
1346
+ def save_images(sample: torch.Tensor, args: argparse.Namespace, original_base_name: Optional[str] = None) -> str:
1347
+ """Save images to directory
1348
+
1349
+ Args:
1350
+ sample: Video tensor
1351
+ args: command line arguments
1352
+ original_base_name: Original base name (if latents are loaded from files)
1353
+
1354
+ Returns:
1355
+ str: Path to saved images directory
1356
+ """
1357
+ save_path = args.save_path
1358
+ os.makedirs(save_path, exist_ok=True)
1359
+ time_flag = datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S")
1360
+
1361
+ seed = args.seed
1362
+ original_name = "" if original_base_name is None else f"_{original_base_name}"
1363
+ image_name = f"{time_flag}_{seed}{original_name}"
1364
+ sample = sample.unsqueeze(0)
1365
+ save_images_grid(sample, save_path, image_name, rescale=True)
1366
+ logger.info(f"Sample images saved to: {save_path}/{image_name}")
1367
+
1368
+ return f"{save_path}/{image_name}"
1369
+
1370
+
1371
+ def save_output(
1372
+ latent: torch.Tensor, args: argparse.Namespace, cfg, height: int, width: int, original_base_names: Optional[List[str]] = None
1373
+ ) -> None:
1374
+ """save output
1375
+
1376
+ Args:
1377
+ latent: latent tensor
1378
+ args: command line arguments
1379
+ cfg: model configuration
1380
+ height: height of frame
1381
+ width: width of frame
1382
+ original_base_names: original base names (if latents are loaded from files)
1383
+ """
1384
+ if args.output_type == "latent" or args.output_type == "both":
1385
+ # save latent
1386
+ save_latent(latent, args, height, width)
1387
+
1388
+ if args.output_type == "video" or args.output_type == "both":
1389
+ # save video
1390
+ sample = decode_latent(latent.unsqueeze(0), args, cfg)
1391
+ original_name = "" if original_base_names is None else f"_{original_base_names[0]}"
1392
+ save_video(sample, args, original_base_name)
1393
+
1394
+ elif args.output_type == "images":
1395
+ # save images
1396
+ sample = decode_latent(latent.unsqueeze(0), args, cfg)
1397
+ original_name = "" if original_base_names is None else f"_{original_base_names[0]}"
1398
+ save_images(sample, args, original_base_name)
1399
+
1400
+
1401
+ def preprocess_prompts_for_batch(prompt_lines: List[str], base_args: argparse.Namespace) -> List[Dict]:
1402
+ """Process multiple prompts for batch mode
1403
+
1404
+ Args:
1405
+ prompt_lines: List of prompt lines
1406
+ base_args: Base command line arguments
1407
+
1408
+ Returns:
1409
+ List[Dict]: List of prompt data dictionaries
1410
+ """
1411
+ prompts_data = []
1412
+
1413
+ for line in prompt_lines:
1414
+ line = line.strip()
1415
+ if not line or line.startswith("#"): # Skip empty lines and comments
1416
+ continue
1417
+
1418
+ # Parse prompt line and create override dictionary
1419
+ prompt_data = parse_prompt_line(line)
1420
+ logger.info(f"Parsed prompt data: {prompt_data}")
1421
+ prompts_data.append(prompt_data)
1422
+
1423
+ return prompts_data
1424
+
1425
+
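Batch mode consumes one prompt per line; blank lines and lines starting with '#' are skipped, and any per-line overrides are parsed by parse_prompt_line (defined elsewhere in this file). A minimal sketch of driving it directly, assuming args is the parsed namespace and prompts.txt is a plain text file:

# prompts.txt might contain, for example:
#   # comments and blank lines are ignored
#   A corgi running on the beach at sunset
#   A timelapse of clouds over a mountain ridge
with open("prompts.txt", "r", encoding="utf-8") as f:
    prompts_data = preprocess_prompts_for_batch(f.readlines(), args)
process_batch_prompts(prompts_data, args)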
1426
+ def process_batch_prompts(prompts_data: List[Dict], args: argparse.Namespace) -> None:
1427
+ """Process multiple prompts with model reuse
1428
+
1429
+ Args:
1430
+ prompts_data: List of prompt data dictionaries
1431
+ args: Base command line arguments
1432
+ """
1433
+ if not prompts_data:
1434
+ logger.warning("No valid prompts found")
1435
+ return
1436
+
1437
+ # 1. Load configuration
1438
+ gen_settings = get_generation_settings(args)
1439
+ device, cfg, dit_dtype, dit_weight_dtype, vae_dtype = (
1440
+ gen_settings.device,
1441
+ gen_settings.cfg,
1442
+ gen_settings.dit_dtype,
1443
+ gen_settings.dit_weight_dtype,
1444
+ gen_settings.vae_dtype,
1445
+ )
1446
+ is_i2v = "i2v" in args.task
1447
+
1448
+ # 2. Encode all prompts
1449
+ logger.info("Loading text encoder to encode all prompts")
1450
+ text_encoder = load_text_encoder(args, cfg, device)
1451
+ text_encoder.model.to(device)
1452
+
1453
+ encoded_contexts = {}
1454
+
1455
+ with torch.no_grad():
1456
+ for prompt_data in prompts_data:
1457
+ prompt = prompt_data["prompt"]
1458
+ prompt_args = apply_overrides(args, prompt_data)
1459
+ n_prompt = prompt_data.get(
1460
+ "negative_prompt", prompt_args.negative_prompt if prompt_args.negative_prompt else cfg.sample_neg_prompt
1461
+ )
1462
+
1463
+ if args.fp8_t5:
1464
+ with torch.amp.autocast(device_type=device.type, dtype=cfg.t5_dtype):
1465
+ context = text_encoder([prompt], device)
1466
+ context_null = text_encoder([n_prompt], device)
1467
+ else:
1468
+ context = text_encoder([prompt], device)
1469
+ context_null = text_encoder([n_prompt], device)
1470
+
1471
+ encoded_contexts[prompt] = {"context": context, "context_null": context_null}
1472
+
1473
+ # Free text encoder and clean memory
1474
+ del text_encoder
1475
+ clean_memory_on_device(device)
1476
+
1477
+ # 3. Process I2V additional encodings if needed
1478
+ vae = None
1479
+ if is_i2v:
1480
+ logger.info("Loading VAE and CLIP for I2V preprocessing")
1481
+ vae = load_vae(args, cfg, device, vae_dtype)
1482
+ vae.to_device(device)
1483
+
1484
+ clip = load_clip_model(args, cfg, device)
1485
+ clip.model.to(device)
1486
+
1487
+ # Process each image and encode with CLIP
1488
+ for prompt_data in prompts_data:
1489
+ if "image_path" not in prompt_data:
1490
+ continue
1491
+
1492
+ prompt_args = apply_overrides(args, prompt_data)
1493
+ if not os.path.exists(prompt_args.image_path):
1494
+ logger.warning(f"Image path not found: {prompt_args.image_path}")
1495
+ continue
1496
+
1497
+ # Load and encode image with CLIP
1498
+ img = Image.open(prompt_args.image_path).convert("RGB")
1499
+ img_tensor = TF.to_tensor(img).sub_(0.5).div_(0.5).to(device)
1500
+
1501
+ with torch.amp.autocast(device_type=device.type, dtype=torch.float16), torch.no_grad():
1502
+ clip_context = clip.visual([img_tensor[:, None, :, :]])
1503
+
1504
+ encoded_contexts[prompt_data["prompt"]]["clip_context"] = clip_context
1505
+
1506
+ # Free CLIP and clean memory
1507
+ del clip
1508
+ clean_memory_on_device(device)
1509
+
1510
+ # Keep VAE in CPU memory for later use
1511
+ vae.to_device("cpu")
1512
+ elif cfg.is_fun_control:
1513
+ # For Fun-Control, we need VAE but keep it on CPU
1514
+ vae = load_vae(args, cfg, device, vae_dtype)
1515
+ vae.to_device("cpu")
1516
+
1517
+ # 4. Load DiT model
1518
+ logger.info("Loading DiT model")
1519
+ model = load_dit_model(args, cfg, device, dit_dtype, dit_weight_dtype, is_i2v)
1520
+
1521
+ # 5. Merge LoRA weights if needed
1522
+ if args.lora_weight is not None and len(args.lora_weight) > 0:
1523
+ merge_lora_weights(lora_wan, model, args, device)
1524
+ if args.save_merged_model:
1525
+ logger.info("Model merged and saved. Exiting.")
1526
+ return
1527
+
1528
+ # 6. Optimize model
1529
+ optimize_model(model, args, device, dit_dtype, dit_weight_dtype)
1530
+
1531
+ # Create shared models dict for generate function
1532
+ shared_models = {"vae": vae, "model": model, "encoded_contexts": encoded_contexts}
1533
+
1534
+ # 7. Generate for each prompt
1535
+ all_latents = []
1536
+ all_prompt_args = []
1537
+
1538
+ for i, prompt_data in enumerate(prompts_data):
1539
+ logger.info(f"Processing prompt {i+1}/{len(prompts_data)}: {prompt_data['prompt'][:50]}...")
1540
+
1541
+ # Apply overrides for this prompt
1542
+ prompt_args = apply_overrides(args, prompt_data)
1543
+
1544
+ # Generate latent
1545
+ latent = generate(prompt_args, gen_settings, shared_models)
1546
+
1547
+ # Save latent if needed
1548
+ height, width, _ = check_inputs(prompt_args)
1549
+ if prompt_args.output_type == "latent" or prompt_args.output_type == "both":
1550
+ save_latent(latent, prompt_args, height, width)
1551
+
1552
+ all_latents.append(latent)
1553
+ all_prompt_args.append(prompt_args)
1554
+
1555
+ # 8. Free DiT model
1556
+ del model
1557
+ clean_memory_on_device(device)
1558
+ synchronize_device(device)
1559
+
1560
+ # wait 5 seconds for block swap to finish
1561
+ logger.info("Waiting for 5 seconds to finish block swap")
1562
+ time.sleep(5)
1563
+
1564
+ gc.collect()
1565
+ clean_memory_on_device(device)
1566
+
1567
+ # 9. Decode latents if needed
1568
+ if args.output_type != "latent":
1569
+ logger.info("Decoding latents to videos/images")
1570
+
1571
+ if vae is None:
1572
+ vae = load_vae(args, cfg, device, vae_dtype)
1573
+
1574
+ vae.to_device(device)
1575
+
1576
+ for i, (latent, prompt_args) in enumerate(zip(all_latents, all_prompt_args)):
1577
+ logger.info(f"Decoding output {i+1}/{len(all_latents)}")
1578
+
1579
+ # Decode latent
1580
+ video = decode_latent(latent.unsqueeze(0), prompt_args, cfg)
1581
+
1582
+ # Save as video or images
1583
+ if prompt_args.output_type == "video" or prompt_args.output_type == "both":
1584
+ save_video(video, prompt_args)
1585
+ elif prompt_args.output_type == "images":
1586
+ save_images(video, prompt_args)
1587
+
1588
+ # Free VAE
1589
+ del vae
1590
+
1591
+ clean_memory_on_device(device)
1592
+ gc.collect()
1593
+
1594
+
1595
+ def process_interactive(args: argparse.Namespace) -> None:
1596
+ """Process prompts in interactive mode
1597
+
1598
+ Args:
1599
+ args: Base command line arguments
1600
+ """
1601
+ gen_settings = get_generation_settings(args)
1602
+ device, cfg, dit_dtype, dit_weight_dtype, vae_dtype = (
1603
+ gen_settings.device,
1604
+ gen_settings.cfg,
1605
+ gen_settings.dit_dtype,
1606
+ gen_settings.dit_weight_dtype,
1607
+ gen_settings.vae_dtype,
1608
+ )
1609
+ is_i2v = "i2v" in args.task
1610
+
1611
+ # Initialize models to None
1612
+ text_encoder = None
1613
+ vae = None
1614
+ model = None
1615
+ clip = None
1616
+
1617
+ print("Interactive mode. Enter prompts (Ctrl+D to exit):")
1618
+
1619
+ try:
1620
+ while True:
1621
+ try:
1622
+ line = input("> ")
1623
+ if not line.strip():
1624
+ continue
1625
+
1626
+ # Parse prompt
1627
+ prompt_data = parse_prompt_line(line)
1628
+ prompt_args = apply_overrides(args, prompt_data)
1629
+
1630
+ # Ensure we have all the models we need
1631
+
1632
+ # 1. Load text encoder if not already loaded
1633
+ if text_encoder is None:
1634
+ logger.info("Loading text encoder")
1635
+ text_encoder = load_text_encoder(args, cfg, device)
1636
+
1637
+ text_encoder.model.to(device)
1638
+
1639
+ # Encode prompt
1640
+ n_prompt = prompt_data.get(
1641
+ "negative_prompt", prompt_args.negative_prompt if prompt_args.negative_prompt else cfg.sample_neg_prompt
1642
+ )
1643
+
1644
+ with torch.no_grad():
1645
+ if args.fp8_t5:
1646
+ with torch.amp.autocast(device_type=device.type, dtype=cfg.t5_dtype):
1647
+ context = text_encoder([prompt_data["prompt"]], device)
1648
+ context_null = text_encoder([n_prompt], device)
1649
+ else:
1650
+ context = text_encoder([prompt_data["prompt"]], device)
1651
+ context_null = text_encoder([n_prompt], device)
1652
+
1653
+ encoded_context = {"context": context, "context_null": context_null}
1654
+
1655
+ # Move text encoder to CPU after use
1656
+ text_encoder.model.to("cpu")
1657
+
1658
+ # 2. For I2V, we need CLIP and VAE
1659
+ if is_i2v:
1660
+ if clip is None:
1661
+ logger.info("Loading CLIP model")
1662
+ clip = load_clip_model(args, cfg, device)
1663
+
1664
+ clip.model.to(device)
1665
+
1666
+ # Encode image with CLIP if there's an image path
1667
+ if prompt_args.image_path and os.path.exists(prompt_args.image_path):
1668
+ img = Image.open(prompt_args.image_path).convert("RGB")
1669
+ img_tensor = TF.to_tensor(img).sub_(0.5).div_(0.5).to(device)
1670
+
1671
+ with torch.amp.autocast(device_type=device.type, dtype=torch.float16), torch.no_grad():
1672
+ clip_context = clip.visual([img_tensor[:, None, :, :]])
1673
+
1674
+ encoded_context["clip_context"] = clip_context
1675
+
1676
+ # Move CLIP to CPU after use
1677
+ clip.model.to("cpu")
1678
+
1679
+ # Load VAE if needed
1680
+ if vae is None:
1681
+ logger.info("Loading VAE model")
1682
+ vae = load_vae(args, cfg, device, vae_dtype)
1683
+ elif cfg.is_fun_control and vae is None:
1684
+ # For Fun-Control, we need VAE
1685
+ logger.info("Loading VAE model for Fun-Control")
1686
+ vae = load_vae(args, cfg, device, vae_dtype)
1687
+
1688
+ # 3. Load DiT model if not already loaded
1689
+ if model is None:
1690
+ logger.info("Loading DiT model")
1691
+ model = load_dit_model(args, cfg, device, dit_dtype, dit_weight_dtype, is_i2v)
1692
+
1693
+ # Merge LoRA weights if needed
1694
+ if args.lora_weight is not None and len(args.lora_weight) > 0:
1695
+ merge_lora_weights(lora_wan, model, args, device)
1696
+
1697
+ # Optimize model
1698
+ optimize_model(model, args, device, dit_dtype, dit_weight_dtype)
1699
+ else:
1700
+ # Move model to GPU if it was offloaded
1701
+ model.to(device)
1702
+
1703
+ # Create shared models dict
1704
+ shared_models = {"vae": vae, "model": model, "encoded_contexts": {prompt_data["prompt"]: encoded_context}}
1705
+
1706
+ # Generate latent
1707
+ latent = generate(prompt_args, gen_settings, shared_models)
1708
+
1709
+ # Move model to CPU after generation
1710
+ model.to("cpu")
1711
+
1712
+ # Save latent if needed
1713
+ height, width, _ = check_inputs(prompt_args)
1714
+ if prompt_args.output_type == "latent" or prompt_args.output_type == "both":
1715
+ save_latent(latent, prompt_args, height, width)
1716
+
1717
+ # Decode and save output
1718
+ if prompt_args.output_type != "latent":
1719
+ if vae is None:
1720
+ vae = load_vae(args, cfg, device, vae_dtype)
1721
+
1722
+ vae.to_device(device)
1723
+ video = decode_latent(latent.unsqueeze(0), prompt_args, cfg)
1724
+
1725
+ if prompt_args.output_type == "video" or prompt_args.output_type == "both":
1726
+ save_video(video, prompt_args)
1727
+ elif prompt_args.output_type == "images":
1728
+ save_images(video, prompt_args)
1729
+
1730
+ # Move VAE to CPU after use
1731
+ vae.to_device("cpu")
1732
+
1733
+ clean_memory_on_device(device)
1734
+
1735
+ except KeyboardInterrupt:
1736
+ print("\nInterrupted. Continue (Ctrl+D or Ctrl+Z (Windows) to exit)")
1737
+ continue
1738
+
1739
+ except EOFError:
1740
+ print("\nExiting interactive mode")
1741
+
1742
+ # Clean up all models
1743
+ if text_encoder is not None:
1744
+ del text_encoder
1745
+ if clip is not None:
1746
+ del clip
1747
+ if vae is not None:
1748
+ del vae
1749
+ if model is not None:
1750
+ del model
1751
+
1752
+ clean_memory_on_device(device)
1753
+ gc.collect()
1754
+
1755
+
1756
+ def get_generation_settings(args: argparse.Namespace) -> GenerationSettings:
1757
+ device = torch.device(args.device)
1758
+
1759
+ cfg = WAN_CONFIGS[args.task]
1760
+
1761
+ # select dtype
1762
+ dit_dtype = detect_wan_sd_dtype(args.dit) if args.dit is not None else torch.bfloat16
1763
+ if dit_dtype.itemsize == 1:
1764
+ # if weight is in fp8, use bfloat16 for DiT (input/output)
1765
+ dit_dtype = torch.bfloat16
1766
+ if args.fp8_scaled:
1767
+ raise ValueError(
1768
+ "DiT weights is already in fp8 format, cannot scale to fp8. Please use fp16/bf16 weights / DiTの重みはすでにfp8形式です。fp8にスケーリングできません。fp16/bf16の重みを使用してください"
1769
+ )
1770
+
1771
+ dit_weight_dtype = dit_dtype # default
1772
+ if args.fp8_scaled:
1773
+ dit_weight_dtype = None # various precision weights, so don't cast to specific dtype
1774
+ elif args.fp8:
1775
+ dit_weight_dtype = torch.float8_e4m3fn
1776
+
1777
+ vae_dtype = str_to_dtype(args.vae_dtype) if args.vae_dtype is not None else dit_dtype
1778
+ logger.info(
1779
+ f"Using device: {device}, DiT precision: {dit_dtype}, weight precision: {dit_weight_dtype}, VAE precision: {vae_dtype}"
1780
+ )
1781
+
1782
+ gen_settings = GenerationSettings(
1783
+ device=device,
1784
+ cfg=cfg,
1785
+ dit_dtype=dit_dtype,
1786
+ dit_weight_dtype=dit_weight_dtype,
1787
+ vae_dtype=vae_dtype,
1788
+ )
1789
+ return gen_settings
1790
+
1791
+
1792
+ def main():
1793
+ # Parse arguments
1794
+ args = parse_args()
1795
+
1796
+ # Check if latents are provided
1797
+ latents_mode = args.latent_path is not None and len(args.latent_path) > 0
1798
+
1799
+ # Set device
1800
+ device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
1801
+ device = torch.device(device)
1802
+ logger.info(f"Using device: {device}")
1803
+ args.device = device
1804
+
1805
+ if latents_mode:
1806
+ # Original latent decode mode
1807
+ cfg = WAN_CONFIGS[args.task] # any task is fine
1808
+ original_base_names = []
1809
+ latents_list = []
1810
+ seeds = []
1811
+
1812
+ assert len(args.latent_path) == 1, "Only one latent path is supported for now"
1813
+
1814
+ for latent_path in args.latent_path:
1815
+ original_base_names.append(os.path.splitext(os.path.basename(latent_path))[0])
1816
+ seed = 0
1817
+
1818
+ if os.path.splitext(latent_path)[1] != ".safetensors":
1819
+ latents = torch.load(latent_path, map_location="cpu")
1820
+ else:
1821
+ latents = load_file(latent_path)["latent"]
1822
+ with safe_open(latent_path, framework="pt") as f:
1823
+ metadata = f.metadata()
1824
+ if metadata is None:
1825
+ metadata = {}
1826
+ logger.info(f"Loaded metadata: {metadata}")
1827
+
1828
+ if "seeds" in metadata:
1829
+ seed = int(metadata["seeds"])
1830
+ if "height" in metadata and "width" in metadata:
1831
+ height = int(metadata["height"])
1832
+ width = int(metadata["width"])
1833
+ args.video_size = [height, width]
1834
+ if "video_length" in metadata:
1835
+ args.video_length = int(metadata["video_length"])
1836
+
1837
+ seeds.append(seed)
1838
+ latents_list.append(latents)
1839
+
1840
+ logger.info(f"Loaded latent from {latent_path}. Shape: {latents.shape}")
1841
+
1842
+ latent = torch.stack(latents_list, dim=0) # [N, ...], must be same shape
1843
+
1844
+ height = latents.shape[-2]
1845
+ width = latents.shape[-1]
1846
+ height *= cfg.patch_size[1] * cfg.vae_stride[1]
1847
+ width *= cfg.patch_size[2] * cfg.vae_stride[2]
1848
+ video_length = latents.shape[1]
1849
+ video_length = (video_length - 1) * cfg.vae_stride[0] + 1
1850
+ args.seed = seeds[0]
1851
+
1852
+ # Decode and save
1853
+ save_output(latent[0], args, cfg, height, width, original_base_names)
1854
+
1855
+ elif args.from_file:
1856
+ # Batch mode from file
1857
+ args = setup_args(args)
1858
+
1859
+ # Read prompts from file
1860
+ with open(args.from_file, "r", encoding="utf-8") as f:
1861
+ prompt_lines = f.readlines()
1862
+
1863
+ # Process prompts
1864
+ prompts_data = preprocess_prompts_for_batch(prompt_lines, args)
1865
+ process_batch_prompts(prompts_data, args)
1866
+
1867
+ elif args.interactive:
1868
+ # Interactive mode
1869
+ args = setup_args(args)
1870
+ process_interactive(args)
1871
+
1872
+ else:
1873
+ # Single prompt mode (original behavior)
1874
+ args = setup_args(args)
1875
+ height, width, video_length = check_inputs(args)
1876
+
1877
+ logger.info(
1878
+ f"Video size: {height}x{width}@{video_length} (HxW@F), fps: {args.fps}, "
1879
+ f"infer_steps: {args.infer_steps}, flow_shift: {args.flow_shift}"
1880
+ )
1881
+
1882
+ # Generate latent
1883
+ gen_settings = get_generation_settings(args)
1884
+ latent = generate(args, gen_settings)
1885
+
1886
+ # Make sure the model is freed from GPU memory
1887
+ gc.collect()
1888
+ clean_memory_on_device(args.device)
1889
+
1890
+ # Save latent and video
1891
+ if args.save_merged_model:
1892
+ return
1893
+
1894
+ # Add batch dimension
1895
+ latent = latent.unsqueeze(0)
1896
+ save_output(latent[0], args, WAN_CONFIGS[args.task], height, width)
1897
+
1898
+ logger.info("Done!")
1899
+
1900
+
1901
+ if __name__ == "__main__":
1902
+ main()
wan_train_network.py ADDED
@@ -0,0 +1,444 @@
1
+ import argparse
2
+ from typing import Optional
3
+ from PIL import Image
4
+
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torchvision.transforms.functional as TF
9
+ from tqdm import tqdm
10
+ from accelerate import Accelerator, init_empty_weights
11
+
12
+ from dataset.image_video_dataset import ARCHITECTURE_WAN, ARCHITECTURE_WAN_FULL, load_video
13
+ from hv_generate_video import resize_image_to_bucket
14
+ from hv_train_network import NetworkTrainer, load_prompts, clean_memory_on_device, setup_parser_common, read_config_from_file
15
+
16
+ import logging
17
+
18
+ logger = logging.getLogger(__name__)
19
+ logging.basicConfig(level=logging.INFO)
20
+
21
+ from utils import model_utils
22
+ from utils.safetensors_utils import load_safetensors, MemoryEfficientSafeOpen
23
+ from wan.configs import WAN_CONFIGS
24
+ from wan.modules.clip import CLIPModel
25
+ from wan.modules.model import WanModel, detect_wan_sd_dtype, load_wan_model
26
+ from wan.modules.t5 import T5EncoderModel
27
+ from wan.modules.vae import WanVAE
28
+ from wan.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
29
+
30
+
31
+ class WanNetworkTrainer(NetworkTrainer):
32
+ def __init__(self):
33
+ super().__init__()
34
+
35
+ # region model specific
36
+
37
+ @property
38
+ def architecture(self) -> str:
39
+ return ARCHITECTURE_WAN
40
+
41
+ @property
42
+ def architecture_full_name(self) -> str:
43
+ return ARCHITECTURE_WAN_FULL
44
+
45
+ def handle_model_specific_args(self, args):
46
+ self.config = WAN_CONFIGS[args.task]
47
+ self._i2v_training = "i2v" in args.task # we cannot use config.i2v because Fun-Control T2V has i2v flag TODO refactor this
48
+ self._control_training = self.config.is_fun_control
49
+
50
+ self.dit_dtype = detect_wan_sd_dtype(args.dit)
51
+
52
+ if self.dit_dtype == torch.float16:
53
+ assert args.mixed_precision in ["fp16", "no"], "DiT weights are in fp16, mixed precision must be fp16 or no"
54
+ elif self.dit_dtype == torch.bfloat16:
55
+ assert args.mixed_precision in ["bf16", "no"], "DiT weights are in bf16, mixed precision must be bf16 or no"
56
+
57
+ if args.fp8_scaled and self.dit_dtype.itemsize == 1:
58
+ raise ValueError(
59
+ "DiT weights is already in fp8 format, cannot scale to fp8. Please use fp16/bf16 weights / DiTの重みはすでにfp8形式です。fp8にスケーリングできません。fp16/bf16の重みを使用してください"
60
+ )
61
+
62
+ # dit_dtype cannot be fp8, so we select the appropriate dtype
63
+ if self.dit_dtype.itemsize == 1:
64
+ self.dit_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16
65
+
66
+ args.dit_dtype = model_utils.dtype_to_str(self.dit_dtype)
67
+
68
+ self.default_guidance_scale = 1.0 # not used
69
+
70
+ def process_sample_prompts(
71
+ self,
72
+ args: argparse.Namespace,
73
+ accelerator: Accelerator,
74
+ sample_prompts: str,
75
+ ):
76
+ config = self.config
77
+ device = accelerator.device
78
+ t5_path, clip_path, fp8_t5 = args.t5, args.clip, args.fp8_t5
79
+
80
+ logger.info(f"cache Text Encoder outputs for sample prompt: {sample_prompts}")
81
+ prompts = load_prompts(sample_prompts)
82
+
83
+ def encode_for_text_encoder(text_encoder):
84
+ sample_prompts_te_outputs = {} # (prompt) -> (embeds, mask)
85
+ # with accelerator.autocast(), torch.no_grad(): # this causes NaN if dit_dtype is fp16
86
+ t5_dtype = config.t5_dtype
87
+ with torch.amp.autocast(device_type=device.type, dtype=t5_dtype), torch.no_grad():
88
+ for prompt_dict in prompts:
89
+ if "negative_prompt" not in prompt_dict:
90
+ prompt_dict["negative_prompt"] = self.config["sample_neg_prompt"]
91
+ for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", None)]:
92
+ if p is None:
93
+ continue
94
+ if p not in sample_prompts_te_outputs:
95
+ logger.info(f"cache Text Encoder outputs for prompt: {p}")
96
+
97
+ prompt_outputs = text_encoder([p], device)
98
+ sample_prompts_te_outputs[p] = prompt_outputs
99
+
100
+ return sample_prompts_te_outputs
101
+
102
+ # Load Text Encoder 1 and encode
103
+ logger.info(f"loading T5: {t5_path}")
104
+ t5 = T5EncoderModel(text_len=config.text_len, dtype=config.t5_dtype, device=device, weight_path=t5_path, fp8=fp8_t5)
105
+
106
+ logger.info("encoding with Text Encoder 1")
107
+ te_outputs_1 = encode_for_text_encoder(t5)
108
+ del t5
109
+
110
+ # load CLIP and encode image (for I2V training)
111
+ # Note: VAE encoding is done in do_inference() for I2V training, because we have VAE in the pipeline. Control video is also done in do_inference()
112
+ sample_prompts_image_embs = {}
113
+ for prompt_dict in prompts:
114
+ if prompt_dict.get("image_path", None) is not None and self.i2v_training:
115
+ sample_prompts_image_embs[prompt_dict["image_path"]] = None # this will be replaced with CLIP context
116
+
117
+ if len(sample_prompts_image_embs) > 0:
118
+ logger.info(f"loading CLIP: {clip_path}")
119
+ assert clip_path is not None, "CLIP path is required for I2V training / I2V学習にはCLIPのパスが必要です"
120
+ clip = CLIPModel(dtype=config.clip_dtype, device=device, weight_path=clip_path)
121
+ clip.model.to(device)
122
+
123
+ logger.info(f"Encoding image to CLIP context")
124
+ with torch.amp.autocast(device_type=device.type, dtype=torch.float16), torch.no_grad():
125
+ for image_path in sample_prompts_image_embs:
126
+ logger.info(f"Encoding image: {image_path}")
127
+ img = Image.open(image_path).convert("RGB")
128
+ img = TF.to_tensor(img).sub_(0.5).div_(0.5).to(device) # -1 to 1
129
+ clip_context = clip.visual([img[:, None, :, :]])
130
+ sample_prompts_image_embs[image_path] = clip_context
131
+
132
+ del clip
133
+ clean_memory_on_device(device)
134
+
135
+ # prepare sample parameters
136
+ sample_parameters = []
137
+ for prompt_dict in prompts:
138
+ prompt_dict_copy = prompt_dict.copy()
139
+
140
+ p = prompt_dict.get("prompt", "")
141
+ prompt_dict_copy["t5_embeds"] = te_outputs_1[p][0]
142
+
143
+ p = prompt_dict.get("negative_prompt", None)
144
+ if p is not None:
145
+ prompt_dict_copy["negative_t5_embeds"] = te_outputs_1[p][0]
146
+
147
+ p = prompt_dict.get("image_path", None)
148
+ if p is not None and self.i2v_training:
149
+ prompt_dict_copy["clip_embeds"] = sample_prompts_image_embs[p]
150
+
151
+ sample_parameters.append(prompt_dict_copy)
152
+
153
+ clean_memory_on_device(accelerator.device)
154
+
155
+ return sample_parameters
156
+
157
+ def do_inference(
158
+ self,
159
+ accelerator,
160
+ args,
161
+ sample_parameter,
162
+ vae,
163
+ dit_dtype,
164
+ transformer,
165
+ discrete_flow_shift,
166
+ sample_steps,
167
+ width,
168
+ height,
169
+ frame_count,
170
+ generator,
171
+ do_classifier_free_guidance,
172
+ guidance_scale,
173
+ cfg_scale,
174
+ image_path=None,
175
+ control_video_path=None,
176
+ ):
177
+ """architecture dependent inference"""
178
+ model: WanModel = transformer
179
+ device = accelerator.device
180
+ if cfg_scale is None:
181
+ cfg_scale = 5.0
182
+ do_classifier_free_guidance = do_classifier_free_guidance and cfg_scale != 1.0
183
+
184
+ # Calculate latent video length based on VAE version
185
+ latent_video_length = (frame_count - 1) // self.config["vae_stride"][0] + 1
186
+
187
+ # Get embeddings
188
+ context = sample_parameter["t5_embeds"].to(device=device)
189
+ if do_classifier_free_guidance:
190
+ context_null = sample_parameter["negative_t5_embeds"].to(device=device)
191
+ else:
192
+ context_null = None
193
+
194
+ num_channels_latents = 16 # model.in_dim
195
+ vae_scale_factor = self.config["vae_stride"][1]
196
+
197
+ # Initialize latents
198
+ lat_h = height // vae_scale_factor
199
+ lat_w = width // vae_scale_factor
200
+ shape_or_frame = (1, num_channels_latents, 1, lat_h, lat_w)
201
+ latents = []
202
+ for _ in range(latent_video_length):
203
+ latents.append(torch.randn(shape_or_frame, generator=generator, device=device, dtype=torch.float32))
204
+ latents = torch.cat(latents, dim=2)
205
+
206
+ image_latents = None
207
+ if self.i2v_training or self.control_training:
208
+ # Move VAE to the appropriate device for sampling: consider to cache image latents in CPU in advance
209
+ vae.to(device)
210
+ vae.eval()
211
+
212
+ if self.i2v_training:
213
+ image = Image.open(image_path)
214
+ image = resize_image_to_bucket(image, (width, height)) # returns a numpy array
215
+ image = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(1).float() # C, 1, H, W
216
+ image = image / 127.5 - 1 # -1 to 1
217
+
218
+ # Create mask for the required number of frames
219
+ msk = torch.ones(1, frame_count, lat_h, lat_w, device=device)
220
+ msk[:, 1:] = 0
221
+ msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1)
222
+ msk = msk.view(1, msk.shape[1] // 4, 4, lat_h, lat_w)
223
+ msk = msk.transpose(1, 2) # B, C, T, H, W
224
+
225
+ with torch.amp.autocast(device_type=device.type, dtype=vae.dtype), torch.no_grad():
226
+ # Zero padding for the required number of frames only
227
+ padding_frames = frame_count - 1 # The first frame is the input image
228
+ image = torch.concat([image, torch.zeros(3, padding_frames, height, width)], dim=1).to(device=device)
229
+ y = vae.encode([image])[0]
230
+
231
+ y = y[:, :latent_video_length] # may be not needed
232
+ y = y.unsqueeze(0) # add batch dim
233
+ image_latents = torch.concat([msk, y], dim=1)
234
+
235
+ if self.control_training:
236
+ # Control video
237
+ video = load_video(control_video_path, 0, frame_count, bucket_reso=(width, height)) # list of frames
238
+ video = np.stack(video, axis=0) # F, H, W, C
239
+ video = torch.from_numpy(video).permute(3, 0, 1, 2).float() # C, F, H, W
240
+ video = video / 127.5 - 1 # -1 to 1
241
+ video = video.to(device=device)
242
+
243
+ with torch.amp.autocast(device_type=device.type, dtype=vae.dtype), torch.no_grad():
244
+ control_latents = vae.encode([video])[0]
245
+ control_latents = control_latents[:, :latent_video_length]
246
+ control_latents = control_latents.unsqueeze(0) # add batch dim
247
+
248
+ # We supports Wan2.1-Fun-Control only
249
+ if image_latents is not None:
250
+ image_latents = image_latents[:, 4:] # remove mask for Wan2.1-Fun-Control
251
+ image_latents[:, :, 1:] = 0 # remove except the first frame
252
+ else:
253
+ image_latents = torch.zeros_like(control_latents) # B, C, F, H, W
254
+
255
+ image_latents = torch.concat([control_latents, image_latents], dim=1) # B, C, F, H, W
256
+
257
+ vae.to("cpu")
258
+ clean_memory_on_device(device)
259
+
260
+ # use the default value for num_train_timesteps (1000)
261
+ scheduler = FlowUniPCMultistepScheduler(shift=1, use_dynamic_shifting=False)
262
+ scheduler.set_timesteps(sample_steps, device=device, shift=discrete_flow_shift)
263
+ timesteps = scheduler.timesteps
264
+
265
+ # Generate noise for the required number of frames only
266
+ noise = torch.randn(16, latent_video_length, lat_h, lat_w, dtype=torch.float32, generator=generator, device=device).to(
267
+ "cpu"
268
+ )
269
+
270
+ # prepare the model input
271
+ max_seq_len = latent_video_length * lat_h * lat_w // (self.config.patch_size[1] * self.config.patch_size[2])
272
+ arg_c = {"context": [context], "seq_len": max_seq_len}
273
+ arg_null = {"context": [context_null], "seq_len": max_seq_len}
274
+
275
+ if self.i2v_training:
276
+ arg_c["clip_fea"] = sample_parameter["clip_embeds"].to(device=device, dtype=dit_dtype)
277
+ arg_null["clip_fea"] = arg_c["clip_fea"]
278
+ if self.i2v_training or self.control_training:
279
+ arg_c["y"] = image_latents
280
+ arg_null["y"] = image_latents
281
+
282
+ # Wrap the inner loop with tqdm to track progress over timesteps
283
+ prompt_idx = sample_parameter.get("enum", 0)
284
+ latent = noise
285
+ with torch.no_grad():
286
+ for i, t in enumerate(tqdm(timesteps, desc=f"Sampling timesteps for prompt {prompt_idx+1}")):
287
+ latent_model_input = [latent.to(device=device)]
288
+ timestep = t.unsqueeze(0)
289
+
290
+ with accelerator.autocast():
291
+ noise_pred_cond = model(latent_model_input, t=timestep, **arg_c)[0].to("cpu")
292
+ if do_classifier_free_guidance:
293
+ noise_pred_uncond = model(latent_model_input, t=timestep, **arg_null)[0].to("cpu")
294
+ else:
295
+ noise_pred_uncond = None
296
+
297
+ if do_classifier_free_guidance:
298
+ noise_pred = noise_pred_uncond + cfg_scale * (noise_pred_cond - noise_pred_uncond)
299
+ else:
300
+ noise_pred = noise_pred_cond
301
+
302
+ temp_x0 = scheduler.step(noise_pred.unsqueeze(0), t, latent.unsqueeze(0), return_dict=False, generator=generator)[0]
303
+ latent = temp_x0.squeeze(0)
304
+
305
+ # Move VAE to the appropriate device for sampling
306
+ vae.to(device)
307
+ vae.eval()
308
+
309
+ # Decode latents to video
310
+ logger.info(f"Decoding video from latents: {latent.shape}")
311
+ latent = latent.unsqueeze(0) # add batch dim
312
+ latent = latent.to(device=device)
313
+
314
+ with torch.amp.autocast(device_type=device.type, dtype=vae.dtype), torch.no_grad():
315
+ video = vae.decode(latent)[0] # vae returns list
316
+ video = video.unsqueeze(0) # add batch dim
317
+ del latent
318
+
319
+ logger.info(f"Decoding complete")
320
+ video = video.to(torch.float32).cpu()
321
+ video = (video / 2 + 0.5).clamp(0, 1) # -1 to 1 -> 0 to 1
322
+
323
+ vae.to("cpu")
324
+ clean_memory_on_device(device)
325
+
326
+ return video
327
+
328
+ def load_vae(self, args: argparse.Namespace, vae_dtype: torch.dtype, vae_path: str):
329
+ vae_path = args.vae
330
+
331
+ logger.info(f"Loading VAE model from {vae_path}")
332
+ cache_device = torch.device("cpu") if args.vae_cache_cpu else None
333
+ vae = WanVAE(vae_path=vae_path, device="cpu", dtype=vae_dtype, cache_device=cache_device)
334
+ return vae
335
+
336
+ def load_transformer(
337
+ self,
338
+ accelerator: Accelerator,
339
+ args: argparse.Namespace,
340
+ dit_path: str,
341
+ attn_mode: str,
342
+ split_attn: bool,
343
+ loading_device: str,
344
+ dit_weight_dtype: Optional[torch.dtype],
345
+ ):
346
+ model = load_wan_model(
347
+ self.config, accelerator.device, dit_path, attn_mode, split_attn, loading_device, dit_weight_dtype, args.fp8_scaled
348
+ )
349
+ return model
350
+
351
+ def scale_shift_latents(self, latents):
352
+ return latents
353
+
354
+ def call_dit(
355
+ self,
356
+ args: argparse.Namespace,
357
+ accelerator: Accelerator,
358
+ transformer,
359
+ latents: torch.Tensor,
360
+ batch: dict[str, torch.Tensor],
361
+ noise: torch.Tensor,
362
+ noisy_model_input: torch.Tensor,
363
+ timesteps: torch.Tensor,
364
+ network_dtype: torch.dtype,
365
+ ):
366
+ model: WanModel = transformer
367
+
368
+ # I2V training and Control training
369
+ image_latents = None
370
+ clip_fea = None
371
+ if self.i2v_training:
372
+ image_latents = batch["latents_image"]
373
+ image_latents = image_latents.to(device=accelerator.device, dtype=network_dtype)
374
+ clip_fea = batch["clip"]
375
+ clip_fea = clip_fea.to(device=accelerator.device, dtype=network_dtype)
376
+ if self.control_training:
377
+ control_latents = batch["latents_control"]
378
+ control_latents = control_latents.to(device=accelerator.device, dtype=network_dtype)
379
+ if image_latents is not None:
380
+ image_latents = image_latents[:, 4:] # remove mask for Wan2.1-Fun-Control
381
+ image_latents[:, :, 1:] = 0 # remove except the first frame
382
+ else:
383
+ image_latents = torch.zeros_like(control_latents) # B, C, F, H, W
384
+ image_latents = torch.concat([control_latents, image_latents], dim=1) # B, C, F, H, W
385
+ control_latents = None
386
+
387
+ context = [t.to(device=accelerator.device, dtype=network_dtype) for t in batch["t5"]]
388
+
389
+ # ensure the hidden state will require grad
390
+ if args.gradient_checkpointing:
391
+ noisy_model_input.requires_grad_(True)
392
+ for t in context:
393
+ t.requires_grad_(True)
394
+ if image_latents is not None:
395
+ image_latents.requires_grad_(True)
396
+ if clip_fea is not None:
397
+ clip_fea.requires_grad_(True)
398
+
399
+ # call DiT
400
+ lat_f, lat_h, lat_w = latents.shape[2:5]
401
+ seq_len = lat_f * lat_h * lat_w // (self.config.patch_size[0] * self.config.patch_size[1] * self.config.patch_size[2])
402
+ latents = latents.to(device=accelerator.device, dtype=network_dtype)
403
+ noisy_model_input = noisy_model_input.to(device=accelerator.device, dtype=network_dtype)
404
+ with accelerator.autocast():
405
+ model_pred = model(noisy_model_input, t=timesteps, context=context, clip_fea=clip_fea, seq_len=seq_len, y=image_latents)
406
+ model_pred = torch.stack(model_pred, dim=0) # list to tensor
407
+
408
+ # flow matching loss
409
+ target = noise - latents
410
+
411
+ return model_pred, target
412
+
413
+ # endregion model specific
414
+
415
+
416
+ def wan_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
417
+ """Wan2.1 specific parser setup"""
418
+ parser.add_argument("--task", type=str, default="t2v-14B", choices=list(WAN_CONFIGS.keys()), help="The task to run.")
419
+ parser.add_argument("--fp8_scaled", action="store_true", help="use scaled fp8 for DiT / DiTにスケーリングされたfp8を使う")
420
+ parser.add_argument("--t5", type=str, default=None, help="text encoder (T5) checkpoint path")
421
+ parser.add_argument("--fp8_t5", action="store_true", help="use fp8 for Text Encoder model")
422
+ parser.add_argument(
423
+ "--clip",
424
+ type=str,
425
+ default=None,
426
+ help="text encoder (CLIP) checkpoint path, optional. If training I2V model, this is required",
427
+ )
428
+ parser.add_argument("--vae_cache_cpu", action="store_true", help="cache features in VAE on CPU")
429
+ return parser
430
+
431
+
432
+ if __name__ == "__main__":
433
+ parser = setup_parser_common()
434
+ parser = wan_setup_parser(parser)
435
+
436
+ args = parser.parse_args()
437
+ args = read_config_from_file(args, parser)
438
+
439
+ args.dit_dtype = None # automatically detected
440
+ if args.vae_dtype is None:
441
+ args.vae_dtype = "bfloat16" # make bfloat16 as default for VAE
442
+
443
+ trainer = WanNetworkTrainer()
444
+ trainer.train(args)