svjack committed
Commit 4d42572 (verified) · 1 parent: 71e89ad

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.

Files changed (50):
  1. .gitattributes +3 -0
  2. cache_latents.py +61 -3
  3. cache_text_encoder_outputs.py +3 -3
  4. convert_lora.py +2 -0
  5. dataset/config_utils.py +30 -2
  6. dataset/dataset_config.md +231 -80
  7. dataset/image_video_dataset.py +561 -72
  8. docs/advanced_config.md +166 -1
  9. docs/framepack.md +607 -0
  10. docs/framepack_1f.md +359 -0
  11. docs/kisekaeichi_ref.png +3 -0
  12. docs/kisekaeichi_ref_mask.png +0 -0
  13. docs/kisekaeichi_result.png +3 -0
  14. docs/kisekaeichi_start.png +3 -0
  15. docs/kisekaeichi_start_mask.png +0 -0
  16. docs/sampling_during_training.md +18 -10
  17. docs/wan.md +302 -12
  18. fpack_cache_latents.py +524 -0
  19. fpack_cache_text_encoder_outputs.py +110 -0
  20. fpack_generate_video.py +1832 -0
  21. fpack_train_network.py +617 -0
  22. frame_pack/__init__.py +0 -0
  23. frame_pack/bucket_tools.py +30 -0
  24. frame_pack/clip_vision.py +14 -0
  25. frame_pack/framepack_utils.py +273 -0
  26. frame_pack/hunyuan.py +134 -0
  27. frame_pack/hunyuan_video_packed.py +2038 -0
  28. frame_pack/k_diffusion_hunyuan.py +128 -0
  29. frame_pack/uni_pc_fm.py +142 -0
  30. frame_pack/utils.py +617 -0
  31. frame_pack/wrapper.py +51 -0
  32. hunyuan_model/fp8_optimization.py +39 -0
  33. hv_generate_video.py +52 -27
  34. hv_train_network.py +110 -11
  35. merge_lora.py +1 -1
  36. modules/fp8_optimization_utils.py +356 -0
  37. networks/lora.py +0 -1
  38. networks/lora_framepack.py +65 -0
  39. pyproject.toml +4 -2
  40. requirements.txt +4 -4
  41. utils/safetensors_utils.py +31 -1
  42. utils/sai_model_spec.py +10 -2
  43. utils/train_utils.py +1 -0
  44. wan/__init__.py +0 -2
  45. wan/configs/__init__.py +46 -19
  46. wan/configs/shared_config.py +1 -0
  47. wan/configs/wan_i2v_14B.py +11 -8
  48. wan/configs/wan_t2v_14B.py +8 -5
  49. wan/configs/wan_t2v_1_3B.py +8 -5
  50. wan/modules/model.py +135 -5
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+docs/kisekaeichi_ref.png filter=lfs diff=lfs merge=lfs -text
+docs/kisekaeichi_result.png filter=lfs diff=lfs merge=lfs -text
+docs/kisekaeichi_start.png filter=lfs diff=lfs merge=lfs -text
cache_latents.py CHANGED
@@ -86,10 +86,65 @@ def show_console(
     return ord(k) if k else ord(" ")


+def save_video(image: Union[list[Union[Image.Image, np.ndarray]], Union[Image.Image, np.ndarray]], cache_path: str, fps: int = 24):
+    import av
+
+    directory = os.path.dirname(cache_path)
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+
+    if (isinstance(image, np.ndarray) and len(image.shape) == 3) or isinstance(image, Image.Image):
+        # save image
+        image_path = cache_path.replace(".safetensors", ".jpg")
+        img = image if isinstance(image, Image.Image) else Image.fromarray(image)
+        img.save(image_path)
+        print(f"Saved image: {image_path}")
+    else:
+        imgs = image
+        print(f"Number of images: {len(imgs)}")
+        # save video
+        video_path = cache_path.replace(".safetensors", ".mp4")
+        height, width = imgs[0].shape[0:2]
+
+        # create output container
+        container = av.open(video_path, mode="w")
+
+        # create video stream
+        codec = "libx264"
+        pixel_format = "yuv420p"
+        stream = container.add_stream(codec, rate=fps)
+        stream.width = width
+        stream.height = height
+        stream.pix_fmt = pixel_format
+        stream.bit_rate = 1000000  # 1Mbit/s for preview quality
+
+        for frame_img in imgs:
+            if isinstance(frame_img, Image.Image):
+                frame = av.VideoFrame.from_image(frame_img)
+            else:
+                frame = av.VideoFrame.from_ndarray(frame_img, format="rgb24")
+            packets = stream.encode(frame)
+            for packet in packets:
+                container.mux(packet)
+
+        for packet in stream.encode():
+            container.mux(packet)
+
+        container.close()
+
+        print(f"Saved video: {video_path}")
+
+
 def show_datasets(
-    datasets: list[BaseDataset], debug_mode: str, console_width: int, console_back: str, console_num_images: Optional[int]
+    datasets: list[BaseDataset],
+    debug_mode: str,
+    console_width: int,
+    console_back: str,
+    console_num_images: Optional[int],
+    fps: int = 24,
 ):
-    print(f"d: next dataset, q: quit")
+    if debug_mode != "video":
+        print(f"d: next dataset, q: quit")

     num_workers = max(1, os.cpu_count() - 1)
     for i, dataset in enumerate(datasets):
@@ -110,6 +165,9 @@ def show_datasets(
                 num_images_to_show -= 1
                 if num_images_to_show == 0:
                     k = ord("d")  # next dataset
+            elif debug_mode == "video":
+                save_video(item_info.content, item_info.latent_cache_path, fps)
+                k = None  # save next video

             if k == ord("q"):
                 return
@@ -246,7 +304,7 @@ def setup_parser_common() -> argparse.ArgumentParser:
     parser.add_argument("--num_workers", type=int, default=None, help="number of workers for dataset. default is cpu count-1")
     parser.add_argument("--skip_existing", action="store_true", help="skip existing cache files")
     parser.add_argument("--keep_cache", action="store_true", help="keep cache files not in dataset")
-    parser.add_argument("--debug_mode", type=str, default=None, choices=["image", "console"], help="debug mode")
+    parser.add_argument("--debug_mode", type=str, default=None, choices=["image", "console", "video"], help="debug mode")
    parser.add_argument("--console_width", type=int, default=80, help="debug mode: console width")
    parser.add_argument(
        "--console_back", type=str, default=None, help="debug mode: console background color, one of ascii_magic.Back"
cache_text_encoder_outputs.py CHANGED
@@ -100,14 +100,14 @@ def process_text_encoder_batches(


 def post_process_cache_files(
-    datasets: list[BaseDataset], all_cache_files_for_dataset: list[set], all_cache_paths_for_dataset: list[set]
+    datasets: list[BaseDataset], all_cache_files_for_dataset: list[set], all_cache_paths_for_dataset: list[set], keep_cache: bool
 ):
     for i, dataset in enumerate(datasets):
         all_cache_files = all_cache_files_for_dataset[i]
         all_cache_paths = all_cache_paths_for_dataset[i]
         for cache_file in all_cache_files:
             if cache_file not in all_cache_paths:
-                if args.keep_cache:
+                if keep_cache:
                     logger.info(f"Keep cache file not in the dataset: {cache_file}")
                 else:
                     os.remove(cache_file)
@@ -181,7 +181,7 @@ def main(args):
     del text_encoder_2

     # remove cache files not in dataset
-    post_process_cache_files(datasets, all_cache_files_for_dataset, all_cache_paths_for_dataset)
+    post_process_cache_files(datasets, all_cache_files_for_dataset, all_cache_paths_for_dataset, args.keep_cache)


 def setup_parser_common():
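The change above only threads the `keep_cache` flag through as a parameter instead of reading a global `args`. As a minimal sketch of the cleanup rule it implements (illustrative only; every name other than `keep_cache` is hypothetical):

```python
from pathlib import Path

def prune_stale_caches(existing_caches: set[str], referenced_caches: set[str], keep_cache: bool) -> None:
    # A cache file that exists on disk but is no longer referenced by the dataset
    # is either kept (--keep_cache) or deleted, mirroring post_process_cache_files above.
    for cache_file in existing_caches - referenced_caches:
        if keep_cache:
            print(f"Keep cache file not in the dataset: {cache_file}")
        else:
            Path(cache_file).unlink()
```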
convert_lora.py CHANGED
@@ -65,6 +65,8 @@ def convert_to_diffusers(prefix, weights_sd):
             # Wan2.1 lora name to module name: ugly but works
             module_name = module_name.replace("cross.attn", "cross_attn")  # fix cross attn
             module_name = module_name.replace("self.attn", "self_attn")  # fix self attn
+            module_name = module_name.replace("k.img", "k_img")  # fix k img
+            module_name = module_name.replace("v.img", "v_img")  # fix v img
         else:
             # HunyuanVideo lora name to module name: ugly but works
             module_name = module_name.replace("double.blocks.", "double_blocks.")  # fix double blocks
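For context, the two new `replace` calls extend the Wan2.1 fix-ups that map LoRA key fragments back to module names. A small sketch of the mapping (the sample key below is hypothetical, chosen only to show the substitutions):

```python
def fix_wan_module_name(module_name: str) -> str:
    # Same substitutions as convert_to_diffusers() above: dots that belong to the
    # module name itself are restored to underscores.
    for src, dst in (("cross.attn", "cross_attn"), ("self.attn", "self_attn"),
                     ("k.img", "k_img"), ("v.img", "v_img")):
        module_name = module_name.replace(src, dst)
    return module_name

print(fix_wan_module_name("blocks.0.cross.attn.k.img"))  # blocks.0.cross_attn.k_img
```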
dataset/config_utils.py CHANGED
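The main behavioral change in the hunks below is how a `[[datasets]]` entry is classified: the presence of `video_directory`/`video_jsonl_file` (or, symmetrically, the image keys) now decides the dataset type, instead of keying off `target_frames`. A minimal sketch of the new rule (not the library's code):

```python
def is_image_dataset(dataset_config: dict) -> bool:
    # New rule from the diff below: the image keys decide the type, so a video
    # dataset no longer needs target_frames just to be recognized as one.
    return "image_directory" in dataset_config or "image_jsonl_file" in dataset_config

assert is_image_dataset({"image_directory": "/path/to/image_dir", "caption_extension": ".txt"})
assert not is_image_dataset({"video_directory": "/path/to/video_dir", "frame_extraction": "full"})
```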
@@ -41,16 +41,29 @@ class BaseDatasetParams:
 class ImageDatasetParams(BaseDatasetParams):
     image_directory: Optional[str] = None
     image_jsonl_file: Optional[str] = None
+    control_directory: Optional[str] = None
+
+    # FramePack dependent parameters
+    fp_latent_window_size: Optional[int] = 9
+    fp_1f_clean_indices: Optional[Sequence[int]] = None
+    fp_1f_target_index: Optional[int] = None
+    fp_1f_no_post: Optional[bool] = False


 @dataclass
 class VideoDatasetParams(BaseDatasetParams):
     video_directory: Optional[str] = None
     video_jsonl_file: Optional[str] = None
+    control_directory: Optional[str] = None
     target_frames: Sequence[int] = (1,)
     frame_extraction: Optional[str] = "head"
     frame_stride: Optional[int] = 1
     frame_sample: Optional[int] = 1
+    max_frames: Optional[int] = 129
+    source_fps: Optional[float] = None
+
+    # FramePack dependent parameters
+    fp_latent_window_size: Optional[int] = 9


 @dataclass
@@ -99,15 +112,23 @@ class ConfigSanitizer:
         "image_directory": str,
         "image_jsonl_file": str,
         "cache_directory": str,
+        "control_directory": str,
+        "fp_latent_window_size": int,
+        "fp_1f_clean_indices": [int],
+        "fp_1f_target_index": int,
+        "fp_1f_no_post": bool,
     }
     VIDEO_DATASET_DISTINCT_SCHEMA = {
         "video_directory": str,
         "video_jsonl_file": str,
+        "control_directory": str,
         "target_frames": [int],
         "frame_extraction": str,
         "frame_stride": int,
         "frame_sample": int,
+        "max_frames": int,
         "cache_directory": str,
+        "source_fps": float,
     }

     # options handled by argparse but not handled by user config
@@ -126,7 +147,7 @@ class ConfigSanitizer:
         )

         def validate_flex_dataset(dataset_config: dict):
-            if "target_frames" in dataset_config:
+            if "video_directory" in dataset_config or "video_jsonl_file" in dataset_config:
                 return Schema(self.video_dataset_schema)(dataset_config)
             else:
                 return Schema(self.image_dataset_schema)(dataset_config)
@@ -194,7 +215,7 @@ class BlueprintGenerator:

         dataset_blueprints = []
         for dataset_config in sanitized_user_config.get("datasets", []):
-            is_image_dataset = "target_frames" not in dataset_config
+            is_image_dataset = "image_directory" in dataset_config or "image_jsonl_file" in dataset_config
             if is_image_dataset:
                 dataset_params_klass = ImageDatasetParams
             else:
@@ -277,6 +298,10 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu
                 f"""\
                 image_directory: "{dataset.image_directory}"
                 image_jsonl_file: "{dataset.image_jsonl_file}"
+                fp_latent_window_size: {dataset.fp_latent_window_size}
+                fp_1f_clean_indices: {dataset.fp_1f_clean_indices}
+                fp_1f_target_index: {dataset.fp_1f_target_index}
+                fp_1f_no_post: {dataset.fp_1f_no_post}
                 \n"""
             ),
             " ",
@@ -287,10 +312,13 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu
                 f"""\
                 video_directory: "{dataset.video_directory}"
                 video_jsonl_file: "{dataset.video_jsonl_file}"
+                control_directory: "{dataset.control_directory}"
                 target_frames: {dataset.target_frames}
                 frame_extraction: {dataset.frame_extraction}
                 frame_stride: {dataset.frame_stride}
                 frame_sample: {dataset.frame_sample}
+                max_frames: {dataset.max_frames}
+                source_fps: {dataset.source_fps}
                 \n"""
             ),
             " ",
dataset/dataset_config.md CHANGED
@@ -2,16 +2,13 @@

 ## Dataset Configuration

-<details>
-<summary>English</summary>
-
 Please create a TOML file for dataset configuration.

 Image and video datasets are supported. The configuration file can include multiple datasets, either image or video datasets, with caption text files or metadata JSONL files.

 The cache directory must be different for each dataset.
-</details>

+Each video is extracted frame by frame without additional processing and used for training. It is recommended to use videos with a frame rate of 24fps for HunyuanVideo, 16fps for Wan2.1 and 30fps for FramePack. You can check the videos that will be trained using `--debug_mode video` when caching latent (see [here](/README.md#latent-caching)).
 <details>
 <summary>日本語</summary>

@@ -20,6 +17,8 @@ The cache directory must be different for each dataset.
 画像データセットと動画データセットがサポートされています。設定ファイルには、画像または動画データセットを複数含めることができます。キャプションテキストファイルまたはメタデータJSONLファイルを使用できます。

 キャッシュディレクトリは、各データセットごとに異なるディレクトリである必要があります。
+
+動画は追加のプロセスなしでフレームごとに抽出され、学習に用いられます。そのため、HunyuanVideoは24fps、Wan2.1は16fps、FramePackは30fpsのフレームレートの動画を使用することをお勧めします。latentキャッシュ時の`--debug_mode video`を使用すると、学習される動画を確認できます([こちら](/README.ja.md#latentの事前キャッシュ)を参照)。
 </details>

 ### Sample for Image Dataset with Caption Text Files
@@ -44,15 +43,10 @@ num_repeats = 1 # optional, default is 1. Number of times to repeat the dataset.
 # other datasets can be added here. each dataset can have different configurations
 ```

-<details>
-<summary>English</summary>
-
 `cache_directory` is optional, default is None to use the same directory as the image directory. However, we recommend to set the cache directory to avoid accidental sharing of the cache files between different datasets.

 `num_repeats` is also available. It is optional, default is 1 (no repeat). It repeats the images (or videos) that many times to expand the dataset. For example, if `num_repeats = 2` and there are 20 images in the dataset, each image will be duplicated twice (with the same caption) to have a total of 40 images. It is useful to balance the multiple datasets with different sizes.

-</details>
-
 <details>
 <summary>日本語</summary>

@@ -108,9 +102,10 @@ metadata jsonl ファイルを使用する場合、caption_extension は必要
 ### Sample for Video Dataset with Caption Text Files

 ```toml
-# resolution, caption_extension, target_frames, frame_extraction, frame_stride, frame_sample,
-# batch_size, num_repeats, enable_bucket, bucket_no_upscale should be set in either general or datasets
-# num_repeats is also available for video dataset, example is not shown here
+# Common parameters (resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale)
+# can be set in either general or datasets sections
+# Video-specific parameters (target_frames, frame_extraction, frame_stride, frame_sample, max_frames, source_fps)
+# must be set in each datasets section

 # general configurations
 [general]
@@ -125,14 +120,38 @@ video_directory = "/path/to/video_dir"
 cache_directory = "/path/to/cache_directory" # recommended to set cache directory
 target_frames = [1, 25, 45]
 frame_extraction = "head"
+source_fps = 30.0 # optional, source fps for videos in the directory, decimal number
+
+[[datasets]]
+video_directory = "/path/to/video_dir2"
+cache_directory = "/path/to/cache_directory2" # recommended to set cache directory
+frame_extraction = "full"
+max_frames = 45

 # other datasets can be added here. each dataset can have different configurations
 ```

+__In HunyuanVideo and Wan2.1, the number of `target_frames` must be "N\*4+1" (N=0,1,2,...).__ Otherwise, it will be truncated to the nearest "N*4+1".
+
+In FramePack, it is recommended to set `frame_extraction` to `full` and `max_frames` to a sufficiently large value, as it can handle longer videos. However, if the video is too long, an Out of Memory error may occur during VAE encoding. The videos in FramePack are trimmed to "N * latent_window_size * 4 + 1" frames (for example, 37, 73, 109... if `latent_window_size` is 9).
+
+If the `source_fps` is specified, the videos in the directory are considered to be at this frame rate, and some frames will be skipped to match the model's frame rate (24 for HunyuanVideo and 16 for Wan2.1). __The value must be a decimal number, for example, `30.0` instead of `30`.__ The skipping is done automatically and does not consider the content of the images. Please check if the converted data is correct using `--debug_mode video`.
+
+If `source_fps` is not specified (default), all frames of the video will be used regardless of the video's frame rate.
+
 <details>
 <summary>日本語</summary>

-resolution, caption_extension, target_frames, frame_extraction, frame_stride, frame_sample, batch_size, num_repeats, enable_bucket, bucket_no_upscale は general または datasets のどちらかに設定してください。
+共通パラメータ(resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale)は、generalまたはdatasetsのいずれかに設定できます。
+動画固有のパラメータ(target_frames, frame_extraction, frame_stride, frame_sample, max_frames, source_fps)は、各datasetsセクションに設定する必要があります。
+
+__HunyuanVideoおよびWan2.1では、target_framesの数値は「N\*4+1」である必要があります。__ これ以外の値の場合は、最も近いN\*4+1の値に切り捨てられます。
+
+FramePackでも同様ですが、FramePackでは動画が長くても学習可能なため、 `frame_extraction`に`full` を指定し、`max_frames`を十分に大きな値に設定することをお勧めします。ただし、あまりにも長すぎるとVAEのencodeでOut of Memoryエラーが発生する可能性があります。FramePackの動画は、「N * latent_window_size * 4 + 1」フレームにトリミングされます(latent_window_sizeが9の場合、37、73、109……)。
+
+`source_fps`を指定した場合、ディレクトリ内の動画をこのフレームレートとみなして、モデルのフレームレートにあうようにいくつかのフレームをスキップします(HunyuanVideoは24、Wan2.1は16)。__小数点を含む数値で指定してください。__ 例:`30`ではなく`30.0`。スキップは機械的に行われ、画像の内容は考慮しません。変換後のデータが正しいか、`--debug_mode video`で確認してください。
+
+`source_fps`を指定しない場合、動画のフレームは(動画自体のフレームレートに関係なく)すべて使用されます。

 他の注意事項は画像データセットと同様です。
 </details>
@@ -140,8 +159,11 @@ resolution, caption_extension, target_frames, frame_extraction, frame_stride, fr
 ### Sample for Video Dataset with Metadata JSONL File

 ```toml
-# resolution, target_frames, frame_extraction, frame_stride, frame_sample,
-# batch_size, num_repeats, enable_bucket, bucket_no_upscale should be set in either general or datasets
+# Common parameters (resolution, caption_extension, batch_size, num_repeats, enable_bucket, bucket_no_upscale)
+# can be set in either general or datasets sections
+# Video-specific parameters (target_frames, frame_extraction, frame_stride, frame_sample, max_frames, source_fps)
+# must be set in each datasets section
+
 # caption_extension is not required for metadata jsonl file
 # cache_directory is required for each dataset with metadata jsonl file

@@ -157,7 +179,7 @@ video_jsonl_file = "/path/to/metadata.jsonl"
 target_frames = [1, 25, 45]
 frame_extraction = "head"
 cache_directory = "/path/to/cache_directory_head"
-
+source_fps = 30.0 # optional, source fps for videos in the jsonl file
 # same metadata jsonl file can be used for multiple datasets
 [[datasets]]
 video_jsonl_file = "/path/to/metadata.jsonl"
@@ -175,28 +197,30 @@ JSONL file format for metadata:
 {"video_path": "/path/to/video2.mp4", "caption": "A caption for video2"}
 ```

+`video_path` can be a directory containing multiple images.
+
 <details>
 <summary>日本語</summary>
-
-resolution, target_frames, frame_extraction, frame_stride, frame_sample, batch_size, num_repeats, enable_bucket, bucket_no_upscale は general または datasets のどちらかに設定してください。
-
 metadata jsonl ファイルを使用する場合、caption_extension は必要ありません。また、cache_directory は必須です。

+`video_path`は、複数の画像を含むディレクトリのパスでも構いません。
+
 他の注意事項は今までのデータセットと同様です。
 </details>

 ### frame_extraction Options

-<details>
-<summary>English</summary>
-
 - `head`: Extract the first N frames from the video.
 - `chunk`: Extract frames by splitting the video into chunks of N frames.
 - `slide`: Extract frames from the video with a stride of `frame_stride`.
 - `uniform`: Extract `frame_sample` samples uniformly from the video.
+- `full`: Extract all frames from the video.
+
+In the case of `full`, the entire video is used, but it is trimmed to "N*4+1" frames. It is also trimmed to the `max_frames` if it exceeds that value. To avoid Out of Memory errors, please set `max_frames`.
+
+The frame extraction methods other than `full` are recommended when the video contains repeated actions. `full` is recommended when each video represents a single complete motion.

 For example, consider a video with 40 frames. The following diagrams illustrate each extraction:
-</details>

 <details>
 <summary>日本語</summary>
@@ -205,6 +229,11 @@ For example, consider a video with 40 frames. The following diagrams illustrate
 - `chunk`: 動画をNフレームずつに分割してフレームを抽出します。
 - `slide`: `frame_stride`に指定したフレームごとに動画からNフレームを抽出します。
 - `uniform`: 動画から一定間隔で、`frame_sample`個のNフレームを抽出します。
+- `full`: 動画から全てのフレームを抽出します。
+
+`full`の場合、各動画の全体を用いますが、「N*4+1」のフレーム数にトリミングされます。また`max_frames`を超える場合もその値にトリミングされます。Out of Memoryエラーを避けるために、`max_frames`を設定してください。
+
+`full`以外の抽出方法は、動画が特定の動作を繰り返している場合にお勧めします。`full`はそれぞれの動画がひとつの完結したモーションの場合にお勧めします。

 例えば、40フレームの動画を例とした抽出について、以下の図で説明します。
 </details>
@@ -251,100 +280,209 @@ xxxxxxxxxxxxxxxxxxxxxxxxxooooooooooooooo
 oooooxxxxxxxxxxxxxxxxxxxxxxxxxoooooooooo
 ooooooooooxxxxxxxxxxxxxxxxxxxxxxxxxooooo
 oooooooooooooooxxxxxxxxxxxxxxxxxxxxxxxxx
+
+Three Original Videos, 20, 25, 35 frames: x = frame, o = no frame
+
+full, max_frames = 31 -> extract all frames (trimmed to the maximum length):
+video1: xxxxxxxxxxxxxxxxx (trimmed to 17 frames)
+video2: xxxxxxxxxxxxxxxxxxxxxxxxx (25 frames)
+video3: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx (trimmed to 31 frames)
 ```

-## Specifications
+### Sample for Image Dataset with Control Images

-```toml
-# general configurations
-[general]
-resolution = [960, 544] # optional, [W, H], default is None. This is the default resolution for all datasets
-caption_extension = ".txt" # optional, default is None. This is the default caption extension for all datasets
-batch_size = 1 # optional, default is 1. This is the default batch size for all datasets
-num_repeats = 1 # optional, default is 1. Number of times to repeat the dataset. Useful to balance the multiple datasets with different sizes.
-enable_bucket = true # optional, default is false. Enable bucketing for datasets
-bucket_no_upscale = false # optional, default is false. Disable upscaling for bucketing. Ignored if enable_bucket is false
+The dataset with control images. This is used for training the one frame training for FramePack.

-### Image Dataset
+The dataset configuration with caption text files is similar to the image dataset, but with an additional `control_directory` parameter.

-# sample image dataset with caption text files
-[[datasets]]
-image_directory = "/path/to/image_dir"
-caption_extension = ".txt" # required for caption text files, if general caption extension is not set
-resolution = [960, 544] # required if general resolution is not set
-batch_size = 4 # optional, overwrite the default batch size
-num_repeats = 1 # optional, overwrite the default num_repeats
-enable_bucket = false # optional, overwrite the default bucketing setting
-bucket_no_upscale = true # optional, overwrite the default bucketing setting
-cache_directory = "/path/to/cache_directory" # optional, default is None to use the same directory as the image directory. NOTE: caching is always enabled
+The control images are used from the `control_directory` with the same filename (or different extension) as the image, for example, `image_dir/image1.jpg` and `control_dir/image1.png`. The images in `image_directory` should be the target images (the images to be generated during inference, the changed images). The `control_directory` should contain the starting images for inference. The captions should be stored in `image_directory`.

-# sample image dataset with metadata **jsonl** file
-[[datasets]]
-image_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of image files and captions
-resolution = [960, 544] # required if general resolution is not set
-cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
-# caption_extension is not required for metadata jsonl file
-# batch_size, num_repeats, enable_bucket, bucket_no_upscale are also available for metadata jsonl file
+If multiple control images are specified, the filenames of the control images should be numbered (excluding the extension). For example, specify `image_dir/image1.jpg` and `control_dir/image1_0.png`, `control_dir/image1_1.png`. You can also specify the numbers with four digits, such as `image1_0000.png`, `image1_0001.png`.

-### Video Dataset
+The metadata JSONL file format is the same as the image dataset, but with an additional `control_path` parameter.

-# sample video dataset with caption text files
+```json
+{"image_path": "/path/to/image1.jpg", "control_path": "/path/to/control1.png", "caption": "A caption for image1"}
+{"image_path": "/path/to/image2.jpg", "control_path": "/path/to/control2.png", "caption": "A caption for image2"}
+```
+
+If multiple control images are specified, the attribute names should be `control_path_0`, `control_path_1`, etc.
+
+```json
+{"image_path": "/path/to/image1.jpg", "control_path_0": "/path/to/control1_0.png", "control_path_1": "/path/to/control1_1.png", "caption": "A caption for image1"}
+{"image_path": "/path/to/image2.jpg", "control_path_0": "/path/to/control2_0.png", "control_path_1": "/path/to/control2_1.png", "caption": "A caption for image2"}
+```
+
+The control images can also have an alpha channel. In this case, the alpha channel of the image is used as a mask for the latent.
+
+<details>
+<summary>日本語</summary>
+
+制御画像を持つデータセットです。現時点ではFramePackの単一フレーム学習に使用します。
+
+キャプションファイルを用いる場合は`control_directory`を追加で指定してください。制御画像は、画像と同じファイル名(または拡張子のみが異なるファイル名)の、`control_directory`にある画像が使用されます(例:`image_dir/image1.jpg`と`control_dir/image1.png`)。`image_directory`の画像は学習対象の画像(推論時に生成する画像、変化後の画像)としてください。`control_directory`には推論時の開始画像を格納してください。キャプションは`image_directory`へ格納してください。
+
+複数枚の制御画像が指定可能です。この場合、制御画像のファイル名(拡張子を除く)へ数字を付与してください。例えば、`image_dir/image1.jpg`と`control_dir/image1_0.png`, `control_dir/image1_1.png`のように指定します。`image1_0000.png`, `image1_0001.png`のように数字を4桁で指定することもできます。
+
+メタデータJSONLファイルを使用する場合は、`control_path`を追加してください。複数枚の制御画像を指定する場合は、`control_path_0`, `control_path_1`のように数字を付与してください。
+
+制御画像はアルファチャンネルを持つこともできます。この場合、画像のアルファチャンネルはlatentへのマスクとして使用されます。
+
+</details>
+
+### Sample for Video Dataset with Control Images
+
+The dataset with control videos is used for training ControlNet models.
+
+The dataset configuration with caption text files is similar to the video dataset, but with an additional `control_directory` parameter.
+
+The control video for a video is used from the `control_directory` with the same filename (or different extension) as the video, for example, `video_dir/video1.mp4` and `control_dir/video1.mp4` or `control_dir/video1.mov`. The control video can also be a directory without an extension, for example, `video_dir/video1.mp4` and `control_dir/video1`.
+
+```toml
 [[datasets]]
 video_directory = "/path/to/video_dir"
-caption_extension = ".txt" # required for caption text files, if general caption extension is not set
-resolution = [960, 544] # required if general resolution is not set
+control_directory = "/path/to/control_dir" # required for dataset with control videos
+cache_directory = "/path/to/cache_directory" # recommended to set cache directory
+target_frames = [1, 25, 45]
+frame_extraction = "head"
+```

-target_frames = [1, 25, 79] # required for video dataset. list of video lengths to extract frames. each element must be N*4+1 (N=0,1,2,...)
+The dataset configuration with metadata JSONL file is same as the video dataset, but metadata JSONL file must include the control video paths. The control video path can be a directory containing multiple images.

-# NOTE: Please do not include 1 in target_frames if you are using the frame_extraction "chunk". It will make the all frames to be extracted.
+```json
+{"video_path": "/path/to/video1.mp4", "control_path": "/path/to/control1.mp4", "caption": "A caption for video1"}
+{"video_path": "/path/to/video2.mp4", "control_path": "/path/to/control2.mp4", "caption": "A caption for video2"}
+```

-frame_extraction = "head" # optional, "head" or "chunk", "slide", "uniform". Default is "head"
-frame_stride = 1 # optional, default is 1, available for "slide" frame extraction
-frame_sample = 4 # optional, default is 1 (same as "head"), available for "uniform" frame extraction
-# batch_size, num_repeats, enable_bucket, bucket_no_upscale, cache_directory are also available for video dataset
+<details>
+<summary>日本語</summary>

-# sample video dataset with metadata jsonl file
+制御動画を持つデータセットです。ControlNetモデルの学習に使用します。
+
+キャプションを用いる場合のデータセット設定は動画データセットと似ていますが、`control_directory`パラメータが追加されています。上にある例を参照してください。ある動画に対する制御用動画として、動画と同じファイル名(または拡張子のみが異なるファイル名)の、`control_directory`にある動画が使用されます(例:`video_dir/video1.mp4`と`control_dir/video1.mp4`または`control_dir/video1.mov`)。また、拡張子なしのディレクトリ内の、複数枚の画像を制御用動画として使用することもできます(例:`video_dir/video1.mp4`と`control_dir/video1`)。
+
+データセット設定でメタデータJSONLファイルを使用する場合は、動画と制御用動画のパスを含める必要があります。制御用動画のパスは、複数枚の画像を含むディレクトリのパスでも構いません。
+
+</details>
+
+## Architecture-specific Settings / アーキテクチャ固有の設定
+
+The dataset configuration is shared across all architectures. However, some architectures may require additional settings or have specific requirements for the dataset.
+
+### FramePack
+
+For FramePack, you can set the latent window size for training. It is recommended to set it to 9 for FramePack training. The default value is 9, so you can usually omit this setting.
+
+```toml
 [[datasets]]
-video_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of video files and captions
+fp_latent_window_size = 9
+```

-target_frames = [1, 79]
+<details>
+<summary>日本語</summary>

-cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
-# frame_extraction, frame_stride, frame_sample are also available for metadata jsonl file
+学習時のlatent window sizeを指定できます。FramePackの学習においては、9を指定することを推奨します。省略時は9が使用されますので、通常は省略して構いません。
+
+</details>
+
+### FramePack One Frame Training
+
+For the default one frame training of FramePack, you need to set the following parameters in the dataset configuration:
+
+```toml
+[[datasets]]
+fp_1f_clean_indices = [0]
+fp_1f_target_index = 9
+fp_1f_no_post = false
 ```

-<!--
-# sample image dataset with lance
+**Advanced Settings:**
+
+**Note that these parameters are still experimental, and the optimal values are not yet known.** The parameters may also change in the future.
+
+`fp_1f_clean_indices` sets the `clean_indices` value passed to the FramePack model. You can specify multiple indices. `fp_1f_target_index` sets the index of the frame to be trained (generated). `fp_1f_no_post` sets whether to add a zero value as `clean_latent_post`, default is `false` (add zero value).
+
+The number of control images should match the number of indices specified in `fp_1f_clean_indices`.
+
+The default values mean that the first image (control image) is at index `0`, and the target image (the changed image) is at index `9`.
+
+For training with 1f-mc, set `fp_1f_clean_indices` to `[0, 1]` and `fp_1f_target_index` to `9` (or another value). This allows you to use multiple control images to train a single generated image. The control images will be two in this case.
+
+```toml
 [[datasets]]
-image_lance_dataset = "/path/to/lance_dataset"
-resolution = [960, 544] # required if general resolution is not set
-# batch_size, enable_bucket, bucket_no_upscale, cache_directory are also available for lance dataset
--->
+fp_1f_clean_indices = [0, 1]
+fp_1f_target_index = 9
+fp_1f_no_post = false
+```

-The metadata with .json file will be supported in the near future.
+For training with kisekaeichi, set `fp_1f_clean_indices` to `[0, 10]` and `fp_1f_target_index` to `1` (or another value). This allows you to use the starting image (the image just before the generation section) and the image following the generation section (equivalent to `clean_latent_post`) to train the first image of the generated video. The control images will be two in this case. `fp_1f_no_post` should be set to `true`.
+
+```toml
+[[datasets]]
+fp_1f_clean_indices = [0, 10]
+fp_1f_target_index = 1
+fp_1f_no_post = true
+```
+
+With `fp_1f_clean_indices` and `fp_1f_target_index`, you can specify any number of control images and any index of the target image for training.
+
+If you set `fp_1f_no_post` to `false`, the `clean_latent_post_index` will be `1 + fp1_latent_window_size`.

+You can also set the `no_2x` and `no_4x` options for cache scripts to disable the clean latents 2x and 4x.

+The 2x indices are `1 + fp1_latent_window_size + 1` for two indices (usually `11, 12`), and the 4x indices are `1 + fp1_latent_window_size + 1 + 2` for sixteen indices (usually `13, 14, ..., 28`), regardless of `fp_1f_no_post` and `no_2x`, `no_4x` settings.
+
+<details>
+<summary>日本語</summary>

-<!--
+※ **以下のパラメータは研究中で最適値はまだ不明です。** またパラメータ自体も変更される可能性があります。
+
+デフォルトの1フレーム学習を行う場合、`fp_1f_clean_indices`に`[0]`を、`fp_1f_target_index`に`9`(または5から15程度の値)を、`no_post`に`false`を設定してください。(記述例は英語版ドキュメントを参照、以降同じ。)
+
+**より高度な設定:**
+
+`fp_1f_clean_indices`は、FramePackモデルに渡される `clean_indices` の値を設定します。複数指定が可能です。`fp_1f_target_index`は、学習(生成)対象のフレームのインデックスを設定します。`fp_1f_no_post`は、`clean_latent_post` をゼロ値で追加するかどうかを設定します(デフォルトは`false`で、ゼロ値で追加します)。
+
+制御画像の枚数は`fp_1f_clean_indices`に指定したインデックスの数とあわせてください。
+
+デフォルトの1フレーム学習では、開始画像(制御画像)1枚をインデックス`0`、生成対象の画像(変化後の画像)をインデックス`9`に設定しています。
+
+1f-mcの学習を行う場合は、`fp_1f_clean_indices`に `[0, 1]`を、`fp_1f_target_index`に`9`を設定してください。これにより動画の先頭の2枚の制御画像を使用して、後続の1枚の生成画像を学習します。制御画像は2枚になります。
+
+kisekaeichiの学習を行う場合は、`fp_1f_clean_indices`に `[0, 10]`を、`fp_1f_target_index`に`1`(または他の値)を設定してください。これは、開始画像(生成セクションの直前の画像)(`clean_latent_pre`に相当)と、生成セクションに続く1枚の画像(`clean_latent_post`に相当)を使用して、生成動画の先頭の画像(`target_index=1`)を学習します。制御画像は2枚になります。`f1_1f_no_post`は`true`に設定してください。
+
+`fp_1f_clean_indices`と`fp_1f_target_index`を応用することで、任意の枚数の制御画像を、任意のインデックスを指定して学習することが可能です。
+
+`fp_1f_no_post`を`false`に設定すると、`clean_latent_post_index`は `1 + fp1_latent_window_size` になります。
+
+推論時の `no_2x`、`no_4x`に対応する設定は、キャッシュスクリプトの引数で行えます。なお、2xのindexは `1 + fp1_latent_window_size + 1` からの2個(通常は`11, 12`)、4xのindexは `1 + fp1_latent_window_size + 1 + 2` からの16個になります(通常は`13, 14, ..., 28`)です。これらの値は`fp_1f_no_post`や`no_2x`, `no_4x`の設定に関わらず、常に同じです。
+
+</details>
+
+## Specifications

 ```toml
 # general configurations
 [general]
-resolution = [960, 544] # optional, [W, H], default is None. This is the default resolution for all datasets
+resolution = [960, 544] # optional, [W, H], default is [960, 544]. This is the default resolution for all datasets
 caption_extension = ".txt" # optional, default is None. This is the default caption extension for all datasets
 batch_size = 1 # optional, default is 1. This is the default batch size for all datasets
+num_repeats = 1 # optional, default is 1. Number of times to repeat the dataset. Useful to balance the multiple datasets with different sizes.
 enable_bucket = true # optional, default is false. Enable bucketing for datasets
 bucket_no_upscale = false # optional, default is false. Disable upscaling for bucketing. Ignored if enable_bucket is false

+### Image Dataset
+
 # sample image dataset with caption text files
 [[datasets]]
 image_directory = "/path/to/image_dir"
 caption_extension = ".txt" # required for caption text files, if general caption extension is not set
 resolution = [960, 544] # required if general resolution is not set
 batch_size = 4 # optional, overwrite the default batch size
+num_repeats = 1 # optional, overwrite the default num_repeats
 enable_bucket = false # optional, overwrite the default bucketing setting
 bucket_no_upscale = true # optional, overwrite the default bucketing setting
 cache_directory = "/path/to/cache_directory" # optional, default is None to use the same directory as the image directory. NOTE: caching is always enabled
+control_directory = "/path/to/control_dir" # optional, required for dataset with control images

 # sample image dataset with metadata **jsonl** file
 [[datasets]]
@@ -352,36 +490,49 @@ image_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of image files and
 resolution = [960, 544] # required if general resolution is not set
 cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
 # caption_extension is not required for metadata jsonl file
-# batch_size, enable_bucket, bucket_no_upscale are also available for metadata jsonl file
+# batch_size, num_repeats, enable_bucket, bucket_no_upscale are also available for metadata jsonl file
+
+### Video Dataset

 # sample video dataset with caption text files
 [[datasets]]
 video_directory = "/path/to/video_dir"
 caption_extension = ".txt" # required for caption text files, if general caption extension is not set
 resolution = [960, 544] # required if general resolution is not set
+
+control_directory = "/path/to/control_dir" # optional, required for dataset with control images
+
+# following configurations must be set in each [[datasets]] section for video datasets
+
 target_frames = [1, 25, 79] # required for video dataset. list of video lengths to extract frames. each element must be N*4+1 (N=0,1,2,...)
+
+# NOTE: Please do not include 1 in target_frames if you are using the frame_extraction "chunk". It will make the all frames to be extracted.
+
 frame_extraction = "head" # optional, "head" or "chunk", "slide", "uniform". Default is "head"
 frame_stride = 1 # optional, default is 1, available for "slide" frame extraction
 frame_sample = 4 # optional, default is 1 (same as "head"), available for "uniform" frame extraction
-# batch_size, enable_bucket, bucket_no_upscale, cache_directory are also available for video dataset
+max_frames = 129 # optional, default is 129. Maximum number of frames to extract, available for "full" frame extraction
+# batch_size, num_repeats, enable_bucket, bucket_no_upscale, cache_directory are also available for video dataset

 # sample video dataset with metadata jsonl file
 [[datasets]]
 video_jsonl_file = "/path/to/metadata.jsonl" # includes pairs of video files and captions
+
 target_frames = [1, 79]
+
 cache_directory = "/path/to/cache_directory" # required for metadata jsonl file
-# frame_extraction, frame_stride, frame_sample are also available for metadata jsonl file
+# frame_extraction, frame_stride, frame_sample, max_frames are also available for metadata jsonl file
 ```

+<!--
 # sample image dataset with lance
 [[datasets]]
 image_lance_dataset = "/path/to/lance_dataset"
 resolution = [960, 544] # required if general resolution is not set
 # batch_size, enable_bucket, bucket_no_upscale, cache_directory are also available for lance dataset
+-->

 The metadata with .json file will be supported in the near future.



-
--->
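To make the frame-count rules documented above concrete, here is a small illustrative sketch (an assumption-laden mirror of the described behavior, not the trainer's actual implementation): with `frame_extraction = "full"` a clip is capped at `max_frames` and trimmed down to the nearest "N*4+1" frames, and `source_fps` causes frames to be skipped mechanically so the effective rate matches the model (24 fps for HunyuanVideo, 16 fps for Wan2.1).

```python
def trim_full_extraction(num_frames: int, max_frames: int = 129) -> int:
    # Cap at max_frames, then trim down to the nearest N*4+1 frames.
    n = min(num_frames, max_frames)
    return (n - 1) // 4 * 4 + 1

def kept_frame_indices(num_frames: int, source_fps: float, model_fps: float) -> list[int]:
    # One plausible realization of the documented skipping: sample the source
    # frames at the model's frame rate without inspecting the image content.
    step = source_fps / model_fps
    indices, t = [], 0.0
    while round(t) < num_frames:
        indices.append(int(round(t)))
        t += step
    return indices

print(trim_full_extraction(20, max_frames=31))  # 17, as in the "full" diagram above
print(len(kept_frame_indices(60, 30.0, 16.0)))  # 32 of 60 frames kept for Wan2.1
```

Either way, the converted clips can be inspected with `--debug_mode video`, as the documentation above recommends.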
 
 
 
dataset/image_video_dataset.py CHANGED
@@ -5,7 +5,7 @@ import math
5
  import os
6
  import random
7
  import time
8
- from typing import Optional, Sequence, Tuple, Union
9
 
10
  import numpy as np
11
  import torch
@@ -76,6 +76,8 @@ ARCHITECTURE_HUNYUAN_VIDEO = "hv"
76
  ARCHITECTURE_HUNYUAN_VIDEO_FULL = "hunyuan_video"
77
  ARCHITECTURE_WAN = "wan"
78
  ARCHITECTURE_WAN_FULL = "wan"
 
 
79
 
80
 
81
  def glob_images(directory, base="*"):
@@ -109,6 +111,8 @@ def divisible_by(num: int, divisor: int) -> int:
109
  def resize_image_to_bucket(image: Union[Image.Image, np.ndarray], bucket_reso: tuple[int, int]) -> np.ndarray:
110
  """
111
  Resize the image to the bucket resolution.
 
 
112
  """
113
  is_pil_image = isinstance(image, Image.Image)
114
  if is_pil_image:
@@ -120,23 +124,21 @@ def resize_image_to_bucket(image: Union[Image.Image, np.ndarray], bucket_reso: t
120
  return np.array(image) if is_pil_image else image
121
 
122
  bucket_width, bucket_height = bucket_reso
123
- if bucket_width == image_width or bucket_height == image_height:
124
- image = np.array(image) if is_pil_image else image
 
 
 
 
 
 
 
 
 
 
125
  else:
126
- # resize the image to the bucket resolution to match the short side
127
- scale_width = bucket_width / image_width
128
- scale_height = bucket_height / image_height
129
- scale = max(scale_width, scale_height)
130
- image_width = int(image_width * scale + 0.5)
131
- image_height = int(image_height * scale + 0.5)
132
-
133
- if scale > 1:
134
- image = Image.fromarray(image) if not is_pil_image else image
135
- image = image.resize((image_width, image_height), Image.LANCZOS)
136
- image = np.array(image)
137
- else:
138
- image = np.array(image) if is_pil_image else image
139
- image = cv2.resize(image, (image_width, image_height), interpolation=cv2.INTER_AREA)
140
 
141
  # crop the image to the bucket resolution
142
  crop_left = (image_width - bucket_width) // 2
@@ -151,7 +153,7 @@ class ItemInfo:
151
  item_key: str,
152
  caption: str,
153
  original_size: tuple[int, int],
154
- bucket_size: Optional[Union[tuple[int, int], tuple[int, int, int]]] = None,
155
  frame_count: Optional[int] = None,
156
  content: Optional[np.ndarray] = None,
157
  latent_cache_path: Optional[str] = None,
@@ -165,11 +167,20 @@ class ItemInfo:
165
  self.latent_cache_path = latent_cache_path
166
  self.text_encoder_output_cache_path: Optional[str] = None
167
 
 
 
 
 
 
 
 
 
 
168
  def __str__(self) -> str:
169
  return (
170
  f"ItemInfo(item_key={self.item_key}, caption={self.caption}, "
171
  + f"original_size={self.original_size}, bucket_size={self.bucket_size}, "
172
- + f"frame_count={self.frame_count}, latent_cache_path={self.latent_cache_path})"
173
  )
174
 
175
 
@@ -181,7 +192,7 @@ class ItemInfo:
181
 
182
 
183
  def save_latent_cache(item_info: ItemInfo, latent: torch.Tensor):
184
- """HunyuanVideo architecture only"""
185
  assert latent.dim() == 4, "latent should be 4D tensor (frame, channel, height, width)"
186
 
187
  _, F, H, W = latent.shape
@@ -192,7 +203,11 @@ def save_latent_cache(item_info: ItemInfo, latent: torch.Tensor):
192
 
193
 
194
  def save_latent_cache_wan(
195
- item_info: ItemInfo, latent: torch.Tensor, clip_embed: Optional[torch.Tensor], image_latent: Optional[torch.Tensor]
 
 
 
 
196
  ):
197
  """Wan architecture only"""
198
  assert latent.dim() == 4, "latent should be 4D tensor (frame, channel, height, width)"
@@ -207,9 +222,51 @@ def save_latent_cache_wan(
207
  if image_latent is not None:
208
  sd[f"latents_image_{F}x{H}x{W}_{dtype_str}"] = image_latent.detach().cpu()
209
 
 
 
 
210
  save_latent_cache_common(item_info, sd, ARCHITECTURE_WAN_FULL)
211
 
212
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  def save_latent_cache_common(item_info: ItemInfo, sd: dict[str, torch.Tensor], arch_fullname: str):
214
  metadata = {
215
  "architecture": arch_fullname,
@@ -260,6 +317,20 @@ def save_text_encoder_output_cache_wan(item_info: ItemInfo, embed: torch.Tensor)
260
  save_text_encoder_output_cache_common(item_info, sd, ARCHITECTURE_WAN_FULL)
261
 
262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  def save_text_encoder_output_cache_common(item_info: ItemInfo, sd: dict[str, torch.Tensor], arch_fullname: str):
264
  for key, value in sd.items():
265
  # NaN check and show warning, replace NaN with 0
@@ -299,6 +370,7 @@ def save_text_encoder_output_cache_common(item_info: ItemInfo, sd: dict[str, tor
299
  class BucketSelector:
300
  RESOLUTION_STEPS_HUNYUAN = 16
301
  RESOLUTION_STEPS_WAN = 16
 
302
 
303
  def __init__(
304
  self, resolution: Tuple[int, int], enable_bucket: bool = True, no_upscale: bool = False, architecture: str = "no_default"
@@ -311,6 +383,8 @@ class BucketSelector:
311
  self.reso_steps = BucketSelector.RESOLUTION_STEPS_HUNYUAN
312
  elif self.architecture == ARCHITECTURE_WAN:
313
  self.reso_steps = BucketSelector.RESOLUTION_STEPS_WAN
 
 
314
  else:
315
  raise ValueError(f"Invalid architecture: {self.architecture}")
316
 
@@ -358,48 +432,142 @@ def load_video(
358
  end_frame: Optional[int] = None,
359
  bucket_selector: Optional[BucketSelector] = None,
360
  bucket_reso: Optional[tuple[int, int]] = None,
 
 
361
  ) -> list[np.ndarray]:
362
  """
363
  bucket_reso: if given, resize the video to the bucket resolution, (width, height)
364
  """
365
- container = av.open(video_path)
366
- video = []
367
- for i, frame in enumerate(container.decode(video=0)):
368
- if start_frame is not None and i < start_frame:
369
- continue
370
- if end_frame is not None and i >= end_frame:
371
- break
372
- frame = frame.to_image()
373
-
374
- if bucket_selector is not None and bucket_reso is None:
375
- bucket_reso = bucket_selector.get_bucket_resolution(frame.size)
376
-
377
- if bucket_reso is not None:
378
- frame = resize_image_to_bucket(frame, bucket_reso)
 
379
  else:
380
- frame = np.array(frame)
 
381
 
382
- video.append(frame)
383
- container.close()
384
  return video
385
 
386
 
387
  class BucketBatchManager:
388
 
389
- def __init__(self, bucketed_item_info: dict[tuple[int, int], list[ItemInfo]], batch_size: int):
390
  self.batch_size = batch_size
391
  self.buckets = bucketed_item_info
392
  self.bucket_resos = list(self.buckets.keys())
393
  self.bucket_resos.sort()
394
 
395
- self.bucket_batch_indices = []
 
396
  for bucket_reso in self.bucket_resos:
397
  bucket = self.buckets[bucket_reso]
398
  num_batches = math.ceil(len(bucket) / self.batch_size)
399
  for i in range(num_batches):
400
  self.bucket_batch_indices.append((bucket_reso, i))
401
 
402
- self.shuffle()
 
403
 
404
  def show_bucket_info(self):
405
  for bucket_reso in self.bucket_resos:
@@ -409,8 +577,11 @@ class BucketBatchManager:
409
  logger.info(f"total batches: {len(self)}")
410
 
411
  def shuffle(self):
 
412
  for bucket in self.buckets.values():
413
  random.shuffle(bucket)
 
 
414
  random.shuffle(self.bucket_batch_indices)
415
 
416
  def __len__(self):
@@ -460,7 +631,8 @@ class BucketBatchManager:
460
 
461
  class ContentDatasource:
462
  def __init__(self):
463
- self.caption_only = False
 
464
 
465
  def set_caption_only(self, caption_only: bool):
466
  self.caption_only = caption_only
@@ -498,10 +670,18 @@ class ImageDatasource(ContentDatasource):
498
 
499
 
500
  class ImageDirectoryDatasource(ImageDatasource):
501
- def __init__(self, image_directory: str, caption_extension: Optional[str] = None):
 
502
  super().__init__()
503
  self.image_directory = image_directory
504
  self.caption_extension = caption_extension
 
 
505
  self.current_idx = 0
506
 
507
  # glob images
@@ -509,19 +689,68 @@ class ImageDirectoryDatasource(ImageDatasource):
509
  self.image_paths = glob_images(self.image_directory)
510
  logger.info(f"found {len(self.image_paths)} images")
511

 
512
  def is_indexable(self):
513
  return True
514
 
515
  def __len__(self):
516
  return len(self.image_paths)
517
 
518
- def get_image_data(self, idx: int) -> tuple[str, Image.Image, str]:
519
  image_path = self.image_paths[idx]
520
  image = Image.open(image_path).convert("RGB")
521
 
522
  _, caption = self.get_caption(idx)
523
 
524
- return image_path, image, caption
 
525
 
526
  def get_caption(self, idx: int) -> tuple[str, str]:
527
  image_path = self.image_paths[idx]
@@ -559,9 +788,10 @@ class ImageDirectoryDatasource(ImageDatasource):
559
 
560
 
561
  class ImageJsonlDatasource(ImageDatasource):
562
- def __init__(self, image_jsonl_file: str):
563
  super().__init__()
564
  self.image_jsonl_file = image_jsonl_file
 
565
  self.current_idx = 0
566
 
567
  # load jsonl
@@ -577,20 +807,55 @@ class ImageJsonlDatasource(ImageDatasource):
577
  self.data.append(data)
578
  logger.info(f"loaded {len(self.data)} images")
579

 
580
  def is_indexable(self):
581
  return True
582
 
583
  def __len__(self):
584
  return len(self.data)
585
 
586
- def get_image_data(self, idx: int) -> tuple[str, Image.Image, str]:
587
  data = self.data[idx]
588
  image_path = data["image_path"]
589
  image = Image.open(image_path).convert("RGB")
590
 
591
  caption = data["caption"]
592
 
593
- return image_path, image, caption
 
 
 
 
 
 
 
 
 
 
594
 
595
  def get_caption(self, idx: int) -> tuple[str, str]:
596
  data = self.data[idx]
@@ -634,6 +899,9 @@ class VideoDatasource(ContentDatasource):
634
 
635
  self.bucket_selector = None
636

 
637
  def __len__(self):
638
  raise NotImplementedError
639
 
@@ -650,9 +918,27 @@ class VideoDatasource(ContentDatasource):
650
  end_frame = end_frame if end_frame is not None else self.end_frame
651
  bucket_selector = bucket_selector if bucket_selector is not None else self.bucket_selector
652
 
653
- video = load_video(video_path, start_frame, end_frame, bucket_selector)
 
 
654
  return video
655

656
  def set_start_and_end_frame(self, start_frame: Optional[int], end_frame: Optional[int]):
657
  self.start_frame = start_frame
658
  self.end_frame = end_frame
@@ -660,6 +946,10 @@ class VideoDatasource(ContentDatasource):
660
  def set_bucket_selector(self, bucket_selector: BucketSelector):
661
  self.bucket_selector = bucket_selector
662

 
663
  def __iter__(self):
664
  raise NotImplementedError
665
 
@@ -668,17 +958,58 @@ class VideoDatasource(ContentDatasource):
668
 
669
 
670
  class VideoDirectoryDatasource(VideoDatasource):
671
- def __init__(self, video_directory: str, caption_extension: Optional[str] = None):
672
  super().__init__()
673
  self.video_directory = video_directory
674
  self.caption_extension = caption_extension
 
675
  self.current_idx = 0
676
 
677
- # glob images
678
- logger.info(f"glob images in {self.video_directory}")
679
  self.video_paths = glob_videos(self.video_directory)
680
  logger.info(f"found {len(self.video_paths)} videos")
681
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
682
  def is_indexable(self):
683
  return True
684
 
@@ -691,13 +1022,18 @@ class VideoDirectoryDatasource(VideoDatasource):
691
  start_frame: Optional[int] = None,
692
  end_frame: Optional[int] = None,
693
  bucket_selector: Optional[BucketSelector] = None,
694
- ) -> tuple[str, list[Image.Image], str]:
695
  video_path = self.video_paths[idx]
696
  video = self.get_video_data_from_path(video_path, start_frame, end_frame, bucket_selector)
697
 
698
  _, caption = self.get_caption(idx)
699
 
700
- return video_path, video, caption
 
701
 
702
  def get_caption(self, idx: int) -> tuple[str, str]:
703
  video_path = self.video_paths[idx]
@@ -747,6 +1083,16 @@ class VideoJsonlDatasource(VideoDatasource):
747
  self.data.append(data)
748
  logger.info(f"loaded {len(self.data)} videos")
749

 
750
  def is_indexable(self):
751
  return True
752
 
@@ -759,14 +1105,19 @@ class VideoJsonlDatasource(VideoDatasource):
759
  start_frame: Optional[int] = None,
760
  end_frame: Optional[int] = None,
761
  bucket_selector: Optional[BucketSelector] = None,
762
- ) -> tuple[str, list[Image.Image], str]:
763
  data = self.data[idx]
764
  video_path = data["video_path"]
765
  video = self.get_video_data_from_path(video_path, start_frame, end_frame, bucket_selector)
766
 
767
  caption = data["caption"]
768
 
769
- return video_path, video, caption
 
770
 
771
  def get_caption(self, idx: int) -> tuple[str, str]:
772
  data = self.data[idx]
@@ -973,7 +1324,12 @@ class ImageDataset(BaseDataset):
973
  bucket_no_upscale: bool,
974
  image_directory: Optional[str] = None,
975
  image_jsonl_file: Optional[str] = None,
 
976
  cache_directory: Optional[str] = None,
 
977
  debug_dataset: bool = False,
978
  architecture: str = "no_default",
979
  ):
@@ -990,10 +1346,22 @@ class ImageDataset(BaseDataset):
990
  )
991
  self.image_directory = image_directory
992
  self.image_jsonl_file = image_jsonl_file
 
 
 
 
 
 
 
 
 
 
993
  if image_directory is not None:
994
- self.datasource = ImageDirectoryDatasource(image_directory, caption_extension)
 
 
995
  elif image_jsonl_file is not None:
996
- self.datasource = ImageJsonlDatasource(image_jsonl_file)
997
  else:
998
  raise ValueError("image_directory or image_jsonl_file must be specified")
999
 
@@ -1002,6 +1370,7 @@ class ImageDataset(BaseDataset):
1002
 
1003
  self.batch_manager = None
1004
  self.num_train_items = 0
 
1005
 
1006
  def get_metadata(self):
1007
  metadata = super().get_metadata()
@@ -1009,6 +1378,9 @@ class ImageDataset(BaseDataset):
1009
  metadata["image_directory"] = os.path.basename(self.image_directory)
1010
  if self.image_jsonl_file is not None:
1011
  metadata["image_jsonl_file"] = os.path.basename(self.image_jsonl_file)
 
1012
  return metadata
1013
 
1014
  def get_total_image_count(self):
@@ -1033,12 +1405,27 @@ class ImageDataset(BaseDataset):
1033
  break # submit batch if possible
1034
 
1035
  for future in completed_futures:
1036
- original_size, item_key, image, caption = future.result()
1037
  bucket_height, bucket_width = image.shape[:2]
1038
  bucket_reso = (bucket_width, bucket_height)
1039
 
1040
  item_info = ItemInfo(item_key, caption, original_size, bucket_reso, content=image)
1041
  item_info.latent_cache_path = self.get_latent_cache_path(item_info)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1042
 
1043
  if bucket_reso not in batches:
1044
  batches[bucket_reso] = []
@@ -1061,14 +1448,21 @@ class ImageDataset(BaseDataset):
1061
  for fetch_op in self.datasource:
1062
 
1063
  # fetch and resize image in a separate thread
1064
- def fetch_and_resize(op: callable) -> tuple[tuple[int, int], str, Image.Image, str]:
1065
- image_key, image, caption = op()
1066
  image: Image.Image
1067
  image_size = image.size
1068
 
1069
  bucket_reso = buckset_selector.get_bucket_resolution(image_size)
1070
- image = resize_image_to_bucket(image, bucket_reso)
1071
- return image_size, image_key, image, caption
 
 
 
 
 
 
 
1072
 
1073
  future = executor.submit(fetch_and_resize, fetch_op)
1074
  futures.append(future)
@@ -1113,6 +1507,15 @@ class ImageDataset(BaseDataset):
1113
  continue
1114
 
1115
  bucket_reso = bucket_selector.get_bucket_resolution(image_size)
 
1116
  item_info = ItemInfo(item_key, "", image_size, bucket_reso, latent_cache_path=cache_file)
1117
  item_info.text_encoder_output_cache_path = text_encoder_output_cache_file
1118
 
@@ -1142,6 +1545,10 @@ class ImageDataset(BaseDataset):
1142
 
1143
 
1144
  class VideoDataset(BaseDataset):
 
1145
  def __init__(
1146
  self,
1147
  resolution: Tuple[int, int],
@@ -1154,9 +1561,13 @@ class VideoDataset(BaseDataset):
1154
  frame_stride: Optional[int] = 1,
1155
  frame_sample: Optional[int] = 1,
1156
  target_frames: Optional[list[int]] = None,
 
 
1157
  video_directory: Optional[str] = None,
1158
  video_jsonl_file: Optional[str] = None,
 
1159
  cache_directory: Optional[str] = None,
 
1160
  debug_dataset: bool = False,
1161
  architecture: str = "no_default",
1162
  ):
@@ -1173,13 +1584,42 @@ class VideoDataset(BaseDataset):
1173
  )
1174
  self.video_directory = video_directory
1175
  self.video_jsonl_file = video_jsonl_file
1176
- self.target_frames = target_frames
1177
  self.frame_extraction = frame_extraction
1178
  self.frame_stride = frame_stride
1179
  self.frame_sample = frame_sample
 
1180
 
1181
  if video_directory is not None:
1182
- self.datasource = VideoDirectoryDatasource(video_directory, caption_extension)
1183
  elif video_jsonl_file is not None:
1184
  self.datasource = VideoJsonlDatasource(video_jsonl_file)
1185
 
@@ -1195,6 +1635,7 @@ class VideoDataset(BaseDataset):
1195
 
1196
  self.batch_manager = None
1197
  self.num_train_items = 0
 
1198
 
1199
  def get_metadata(self):
1200
  metadata = super().get_metadata()
@@ -1202,20 +1643,29 @@ class VideoDataset(BaseDataset):
1202
  metadata["video_directory"] = os.path.basename(self.video_directory)
1203
  if self.video_jsonl_file is not None:
1204
  metadata["video_jsonl_file"] = os.path.basename(self.video_jsonl_file)
 
 
1205
  metadata["frame_extraction"] = self.frame_extraction
1206
  metadata["frame_stride"] = self.frame_stride
1207
  metadata["frame_sample"] = self.frame_sample
1208
  metadata["target_frames"] = self.target_frames
 
 
 
1209
  return metadata
1210
 
1211
  def retrieve_latent_cache_batches(self, num_workers: int):
1212
  buckset_selector = BucketSelector(self.resolution, architecture=self.architecture)
1213
  self.datasource.set_bucket_selector(buckset_selector)
 
 
 
 
1214
 
1215
  executor = ThreadPoolExecutor(max_workers=num_workers)
1216
 
1217
- # key: (width, height, frame_count), value: [ItemInfo]
1218
- batches: dict[tuple[int, int, int], list[ItemInfo]] = {}
1219
  futures = []
1220
 
1221
  def aggregate_future(consume_all: bool = False):
@@ -1229,13 +1679,25 @@ class VideoDataset(BaseDataset):
1229
  break # submit batch if possible
1230
 
1231
  for future in completed_futures:
1232
- original_frame_size, video_key, video, caption = future.result()
1233
 
1234
  frame_count = len(video)
1235
  video = np.stack(video, axis=0)
1236
  height, width = video.shape[1:3]
1237
  bucket_reso = (width, height) # already resized
1238
 
 
 
 
 
 
 
 
 
 
 
 
 
1239
  crop_pos_and_frames = []
1240
  if self.frame_extraction == "head":
1241
  for target_frame in self.target_frames:
@@ -1260,6 +1722,11 @@ class VideoDataset(BaseDataset):
1260
  frame_indices = np.linspace(0, frame_count - target_frame, self.frame_sample, dtype=int)
1261
  for i in frame_indices:
1262
  crop_pos_and_frames.append((i, target_frame))
 
 
 
 
 
1263
  else:
1264
  raise ValueError(f"frame_extraction {self.frame_extraction} is not supported")
1265
 
@@ -1269,10 +1736,21 @@ class VideoDataset(BaseDataset):
1269
  item_key = f"{body}_{crop_pos:05d}-{target_frame:03d}{ext}"
1270
  batch_key = (*bucket_reso, target_frame) # bucket_reso with frame_count
1271
 
 
 
 
 
 
 
 
 
 
1272
  item_info = ItemInfo(
1273
  item_key, caption, original_frame_size, batch_key, frame_count=target_frame, content=cropped_video
1274
  )
1275
  item_info.latent_cache_path = self.get_latent_cache_path(item_info)
 
 
1276
 
1277
  batch = batches.get(batch_key, [])
1278
  batch.append(item_info)
@@ -1293,8 +1771,15 @@ class VideoDataset(BaseDataset):
1293
 
1294
  for operator in self.datasource:
1295
 
1296
- def fetch_and_resize(op: callable) -> tuple[tuple[int, int], str, list[np.ndarray], str]:
1297
- video_key, video, caption = op()
 
1298
  video: list[np.ndarray]
1299
  frame_size = (video[0].shape[1], video[0].shape[0])
1300
 
@@ -1302,7 +1787,11 @@ class VideoDataset(BaseDataset):
1302
  bucket_reso = buckset_selector.get_bucket_resolution(frame_size)
1303
  video = [resize_image_to_bucket(frame, bucket_reso) for frame in video]
1304
 
1305
- return frame_size, video_key, video, caption
 
 
 
 
1306
 
1307
  future = executor.submit(fetch_and_resize, operator)
1308
  futures.append(future)
@@ -1340,7 +1829,7 @@ class VideoDataset(BaseDataset):
1340
  image_width, image_height = map(int, image_size.split("x"))
1341
  image_size = (image_width, image_height)
1342
 
1343
- frame_pos, frame_count = tokens[-3].split("-")
1344
  frame_pos, frame_count = int(frame_pos), int(frame_count)
1345
 
1346
  item_key = "_".join(tokens[:-3])
 
5
  import os
6
  import random
7
  import time
8
+ from typing import Any, Optional, Sequence, Tuple, Union
9
 
10
  import numpy as np
11
  import torch
 
76
  ARCHITECTURE_HUNYUAN_VIDEO_FULL = "hunyuan_video"
77
  ARCHITECTURE_WAN = "wan"
78
  ARCHITECTURE_WAN_FULL = "wan"
79
+ ARCHITECTURE_FRAMEPACK = "fp"
80
+ ARCHITECTURE_FRAMEPACK_FULL = "framepack"
81
 
82
 
83
  def glob_images(directory, base="*"):
 
111
  def resize_image_to_bucket(image: Union[Image.Image, np.ndarray], bucket_reso: tuple[int, int]) -> np.ndarray:
112
  """
113
  Resize the image to the bucket resolution.
114
+
115
+ bucket_reso: **(width, height)**
116
  """
117
  is_pil_image = isinstance(image, Image.Image)
118
  if is_pil_image:
 
124
  return np.array(image) if is_pil_image else image
125
 
126
  bucket_width, bucket_height = bucket_reso
127
+
128
+ # resize the image to the bucket resolution to match the short side
129
+ scale_width = bucket_width / image_width
130
+ scale_height = bucket_height / image_height
131
+ scale = max(scale_width, scale_height)
132
+ image_width = int(image_width * scale + 0.5)
133
+ image_height = int(image_height * scale + 0.5)
134
+
135
+ if scale > 1:
136
+ image = Image.fromarray(image) if not is_pil_image else image
137
+ image = image.resize((image_width, image_height), Image.LANCZOS)
138
+ image = np.array(image)
139
  else:
140
+ image = np.array(image) if is_pil_image else image
141
+ image = cv2.resize(image, (image_width, image_height), interpolation=cv2.INTER_AREA)
 
 
142
 
143
  # crop the image to the bucket resolution
144
  crop_left = (image_width - bucket_width) // 2
 
153
  item_key: str,
154
  caption: str,
155
  original_size: tuple[int, int],
156
+ bucket_size: Optional[tuple[Any]] = None,
157
  frame_count: Optional[int] = None,
158
  content: Optional[np.ndarray] = None,
159
  latent_cache_path: Optional[str] = None,
 
167
  self.latent_cache_path = latent_cache_path
168
  self.text_encoder_output_cache_path: Optional[str] = None
169
 
170
+ # np.ndarray for video, list[np.ndarray] for image with multiple controls
171
+ self.control_content: Optional[Union[np.ndarray, list[np.ndarray]]] = None
172
+
173
+ # FramePack architecture specific
174
+ self.fp_latent_window_size: Optional[int] = None
175
+ self.fp_1f_clean_indices: Optional[list[int]] = None # indices of clean latents for 1f
176
+ self.fp_1f_target_index: Optional[int] = None # target index for 1f clean latents
177
+ self.fp_1f_no_post: Optional[bool] = None # whether to add zero values as clean latent post
178
+
179
  def __str__(self) -> str:
180
  return (
181
  f"ItemInfo(item_key={self.item_key}, caption={self.caption}, "
182
  + f"original_size={self.original_size}, bucket_size={self.bucket_size}, "
183
+ + f"frame_count={self.frame_count}, latent_cache_path={self.latent_cache_path}, content={self.content.shape if self.content is not None else None})"
184
  )
185
 
186
 
 
192
 
193
 
194
  def save_latent_cache(item_info: ItemInfo, latent: torch.Tensor):
195
+ """HunyuanVideo architecture only. HunyuanVideo doesn't support I2V and control latents"""
196
  assert latent.dim() == 4, "latent should be 4D tensor (frame, channel, height, width)"
197
 
198
  _, F, H, W = latent.shape
 
203
 
204
 
205
  def save_latent_cache_wan(
206
+ item_info: ItemInfo,
207
+ latent: torch.Tensor,
208
+ clip_embed: Optional[torch.Tensor],
209
+ image_latent: Optional[torch.Tensor],
210
+ control_latent: Optional[torch.Tensor],
211
  ):
212
  """Wan architecture only"""
213
  assert latent.dim() == 4, "latent should be 4D tensor (frame, channel, height, width)"
 
222
  if image_latent is not None:
223
  sd[f"latents_image_{F}x{H}x{W}_{dtype_str}"] = image_latent.detach().cpu()
224
 
225
+ if control_latent is not None:
226
+ sd[f"latents_control_{F}x{H}x{W}_{dtype_str}"] = control_latent.detach().cpu()
227
+
228
  save_latent_cache_common(item_info, sd, ARCHITECTURE_WAN_FULL)
229
 
230
 
231
+ def save_latent_cache_framepack(
232
+ item_info: ItemInfo,
233
+ latent: torch.Tensor,
234
+ latent_indices: torch.Tensor,
235
+ clean_latents: torch.Tensor,
236
+ clean_latent_indices: torch.Tensor,
237
+ clean_latents_2x: torch.Tensor,
238
+ clean_latent_2x_indices: torch.Tensor,
239
+ clean_latents_4x: torch.Tensor,
240
+ clean_latent_4x_indices: torch.Tensor,
241
+ image_embeddings: torch.Tensor,
242
+ ):
243
+ """FramePack architecture only"""
244
+ assert latent.dim() == 4, "latent should be 4D tensor (frame, channel, height, width)"
245
+
246
+ _, F, H, W = latent.shape
247
+ dtype_str = dtype_to_str(latent.dtype)
248
+ sd = {f"latents_{F}x{H}x{W}_{dtype_str}": latent.detach().cpu().contiguous()}
249
+
250
+ # `latents_xxx` must have {F, H, W} suffix
251
+ indices_dtype_str = dtype_to_str(latent_indices.dtype)
252
+ sd[f"image_embeddings_{dtype_str}"] = image_embeddings.detach().cpu() # image embeddings dtype is same as latents dtype
253
+ sd[f"latent_indices_{indices_dtype_str}"] = latent_indices.detach().cpu()
254
+ sd[f"clean_latent_indices_{indices_dtype_str}"] = clean_latent_indices.detach().cpu()
255
+ sd[f"latents_clean_{F}x{H}x{W}_{dtype_str}"] = clean_latents.detach().cpu().contiguous()
256
+ if clean_latent_2x_indices is not None:
257
+ sd[f"clean_latent_2x_indices_{indices_dtype_str}"] = clean_latent_2x_indices.detach().cpu()
258
+ if clean_latents_2x is not None:
259
+ sd[f"latents_clean_2x_{F}x{H}x{W}_{dtype_str}"] = clean_latents_2x.detach().cpu().contiguous()
260
+ if clean_latent_4x_indices is not None:
261
+ sd[f"clean_latent_4x_indices_{indices_dtype_str}"] = clean_latent_4x_indices.detach().cpu()
262
+ if clean_latents_4x is not None:
263
+ sd[f"latents_clean_4x_{F}x{H}x{W}_{dtype_str}"] = clean_latents_4x.detach().cpu().contiguous()
264
+
265
+ # for key, value in sd.items():
266
+ # print(f"{key}: {value.shape}")
267
+ save_latent_cache_common(item_info, sd, ARCHITECTURE_FRAMEPACK_FULL)
268
+
269
+
270
  def save_latent_cache_common(item_info: ItemInfo, sd: dict[str, torch.Tensor], arch_fullname: str):
271
  metadata = {
272
  "architecture": arch_fullname,
 
317
  save_text_encoder_output_cache_common(item_info, sd, ARCHITECTURE_WAN_FULL)
318
 
319
 
320
+ def save_text_encoder_output_cache_framepack(
321
+ item_info: ItemInfo, llama_vec: torch.Tensor, llama_attention_mask: torch.Tensor, clip_l_pooler: torch.Tensor
322
+ ):
323
+ """FramePack architecture only."""
324
+ sd = {}
325
+ dtype_str = dtype_to_str(llama_vec.dtype)
326
+ sd[f"llama_vec_{dtype_str}"] = llama_vec.detach().cpu()
327
+ sd[f"llama_attention_mask"] = llama_attention_mask.detach().cpu()
328
+ dtype_str = dtype_to_str(clip_l_pooler.dtype)
329
+ sd[f"clip_l_pooler_{dtype_str}"] = clip_l_pooler.detach().cpu()
330
+
331
+ save_text_encoder_output_cache_common(item_info, sd, ARCHITECTURE_FRAMEPACK_FULL)
332
+
333
+
334
  def save_text_encoder_output_cache_common(item_info: ItemInfo, sd: dict[str, torch.Tensor], arch_fullname: str):
335
  for key, value in sd.items():
336
  # NaN check and show warning, replace NaN with 0
 
370
  class BucketSelector:
371
  RESOLUTION_STEPS_HUNYUAN = 16
372
  RESOLUTION_STEPS_WAN = 16
373
+ RESOLUTION_STEPS_FRAMEPACK = 16
374
 
375
  def __init__(
376
  self, resolution: Tuple[int, int], enable_bucket: bool = True, no_upscale: bool = False, architecture: str = "no_default"
 
383
  self.reso_steps = BucketSelector.RESOLUTION_STEPS_HUNYUAN
384
  elif self.architecture == ARCHITECTURE_WAN:
385
  self.reso_steps = BucketSelector.RESOLUTION_STEPS_WAN
386
+ elif self.architecture == ARCHITECTURE_FRAMEPACK:
387
+ self.reso_steps = BucketSelector.RESOLUTION_STEPS_FRAMEPACK
388
  else:
389
  raise ValueError(f"Invalid architecture: {self.architecture}")
390
 
 
432
  end_frame: Optional[int] = None,
433
  bucket_selector: Optional[BucketSelector] = None,
434
  bucket_reso: Optional[tuple[int, int]] = None,
435
+ source_fps: Optional[float] = None,
436
+ target_fps: Optional[float] = None,
437
  ) -> list[np.ndarray]:
438
  """
439
  bucket_reso: if given, resize the video to the bucket resolution, (width, height)
440
  """
441
+ if source_fps is None or target_fps is None:
442
+ if os.path.isfile(video_path):
443
+ container = av.open(video_path)
444
+ video = []
445
+ for i, frame in enumerate(container.decode(video=0)):
446
+ if start_frame is not None and i < start_frame:
447
+ continue
448
+ if end_frame is not None and i >= end_frame:
449
+ break
450
+ frame = frame.to_image()
451
+
452
+ if bucket_selector is not None and bucket_reso is None:
453
+ bucket_reso = bucket_selector.get_bucket_resolution(frame.size) # calc resolution from first frame
454
+
455
+ if bucket_reso is not None:
456
+ frame = resize_image_to_bucket(frame, bucket_reso)
457
+ else:
458
+ frame = np.array(frame)
459
+
460
+ video.append(frame)
461
+ container.close()
462
  else:
463
+ # load images in the directory
464
+ image_files = glob_images(video_path)
465
+ image_files.sort()
466
+ video = []
467
+ for i in range(len(image_files)):
468
+ if start_frame is not None and i < start_frame:
469
+ continue
470
+ if end_frame is not None and i >= end_frame:
471
+ break
472
+
473
+ image_file = image_files[i]
474
+ image = Image.open(image_file).convert("RGB")
475
+
476
+ if bucket_selector is not None and bucket_reso is None:
477
+ bucket_reso = bucket_selector.get_bucket_resolution(image.size) # calc resolution from first frame
478
+ image = np.array(image)
479
+ if bucket_reso is not None:
480
+ image = resize_image_to_bucket(image, bucket_reso)
481
+
482
+ video.append(image)
483
+ else:
484
+ # drop frames to match the target fps TODO commonize this code with the above if this works
485
+ frame_index_delta = target_fps / source_fps # example: 16 / 30 = 0.5333
486
+ if os.path.isfile(video_path):
487
+ container = av.open(video_path)
488
+ video = []
489
+ frame_index_with_fraction = 0.0
490
+ previous_frame_index = -1
491
+ for i, frame in enumerate(container.decode(video=0)):
492
+ target_frame_index = int(frame_index_with_fraction)
493
+ frame_index_with_fraction += frame_index_delta
494
+
495
+ if target_frame_index == previous_frame_index: # drop this frame
496
+ continue
497
+
498
+ # accept this frame
499
+ previous_frame_index = target_frame_index
500
+
501
+ if start_frame is not None and target_frame_index < start_frame:
502
+ continue
503
+ if end_frame is not None and target_frame_index >= end_frame:
504
+ break
505
+ frame = frame.to_image()
506
+
507
+ if bucket_selector is not None and bucket_reso is None:
508
+ bucket_reso = bucket_selector.get_bucket_resolution(frame.size) # calc resolution from first frame
509
+
510
+ if bucket_reso is not None:
511
+ frame = resize_image_to_bucket(frame, bucket_reso)
512
+ else:
513
+ frame = np.array(frame)
514
+
515
+ video.append(frame)
516
+ container.close()
517
+ else:
518
+ # load images in the directory
519
+ image_files = glob_images(video_path)
520
+ image_files.sort()
521
+ video = []
522
+ frame_index_with_fraction = 0.0
523
+ previous_frame_index = -1
524
+ for i in range(len(image_files)):
525
+ target_frame_index = int(frame_index_with_fraction)
526
+ frame_index_with_fraction += frame_index_delta
527
+
528
+ if target_frame_index == previous_frame_index: # drop this frame
529
+ continue
530
+
531
+ # accept this frame
532
+ previous_frame_index = target_frame_index
533
+
534
+ if start_frame is not None and target_frame_index < start_frame:
535
+ continue
536
+ if end_frame is not None and target_frame_index >= end_frame:
537
+ break
538
+
539
+ image_file = image_files[i]
540
+ image = Image.open(image_file).convert("RGB")
541
+
542
+ if bucket_selector is not None and bucket_reso is None:
543
+ bucket_reso = bucket_selector.get_bucket_resolution(image.size) # calc resolution from first frame
544
+ image = np.array(image)
545
+ if bucket_reso is not None:
546
+ image = resize_image_to_bucket(image, bucket_reso)
547
+
548
+ video.append(image)
549
 
 
 
550
  return video
551
 
552
 
553
  class BucketBatchManager:
554
 
555
+ def __init__(self, bucketed_item_info: dict[tuple[Any], list[ItemInfo]], batch_size: int):
556
  self.batch_size = batch_size
557
  self.buckets = bucketed_item_info
558
  self.bucket_resos = list(self.buckets.keys())
559
  self.bucket_resos.sort()
560
 
561
+ # indices for enumerating batches. each batch is reso + batch_idx. reso is (width, height) or (width, height, frames)
562
+ self.bucket_batch_indices: list[tuple[tuple[Any], int]] = []
563
  for bucket_reso in self.bucket_resos:
564
  bucket = self.buckets[bucket_reso]
565
  num_batches = math.ceil(len(bucket) / self.batch_size)
566
  for i in range(num_batches):
567
  self.bucket_batch_indices.append((bucket_reso, i))
568
 
569
+ # do not shuffle here, to avoid multiple datasets having different orders
570
+ # self.shuffle()
571
 
572
  def show_bucket_info(self):
573
  for bucket_reso in self.bucket_resos:
 
577
  logger.info(f"total batches: {len(self)}")
578
 
579
  def shuffle(self):
580
+ # shuffle each bucket
581
  for bucket in self.buckets.values():
582
  random.shuffle(bucket)
583
+
584
+ # shuffle the order of batches
585
  random.shuffle(self.bucket_batch_indices)
586
 
587
  def __len__(self):
 
631
 
632
  class ContentDatasource:
633
  def __init__(self):
634
+ self.caption_only = False # set to True to only fetch caption for Text Encoder caching
635
+ self.has_control = False
636
 
637
  def set_caption_only(self, caption_only: bool):
638
  self.caption_only = caption_only
 
670
 
671
 
672
  class ImageDirectoryDatasource(ImageDatasource):
673
+ def __init__(
674
+ self,
675
+ image_directory: str,
676
+ caption_extension: Optional[str] = None,
677
+ control_directory: Optional[str] = None,
678
+ control_count_per_image: int = 1,
679
+ ):
680
  super().__init__()
681
  self.image_directory = image_directory
682
  self.caption_extension = caption_extension
683
+ self.control_directory = control_directory
684
+ self.control_count_per_image = control_count_per_image
685
  self.current_idx = 0
686
 
687
  # glob images
 
689
  self.image_paths = glob_images(self.image_directory)
690
  logger.info(f"found {len(self.image_paths)} images")
691
 
692
+ # glob control images if specified
693
+ if self.control_directory is not None:
694
+ logger.info(f"glob control images in {self.control_directory}")
695
+ self.has_control = True
696
+ self.control_paths = {}
697
+ for image_path in self.image_paths:
698
+ image_basename = os.path.basename(image_path)
699
+ image_basename_no_ext = os.path.splitext(image_basename)[0]
700
+ potential_paths = glob.glob(os.path.join(self.control_directory, os.path.splitext(image_basename)[0] + "*.*"))
701
+ if potential_paths:
702
+ # sort by the digits (`_0000`) suffix, prefer the one without the suffix
703
+ def sort_key(path):
704
+ basename = os.path.basename(path)
705
+ basename_no_ext = os.path.splitext(basename)[0]
706
+ if image_basename_no_ext == basename_no_ext: # prefer the one without suffix
707
+ return 0
708
+ digits_suffix = basename_no_ext.rsplit("_", 1)[-1]
709
+ if not digits_suffix.isdigit():
710
+ raise ValueError(f"Invalid digits suffix in {basename_no_ext}")
711
+ return int(digits_suffix) + 1
712
+
713
+ potential_paths.sort(key=sort_key)
714
+ if len(potential_paths) < control_count_per_image:
715
+ logger.error(
716
+ f"Not enough control images for {image_path}: found {len(potential_paths)}, expected {control_count_per_image}"
717
+ )
718
+ raise ValueError(
719
+ f"Not enough control images for {image_path}: found {len(potential_paths)}, expected {control_count_per_image}"
720
+ )
721
+
722
+ # take the first `control_count_per_image` paths
723
+ self.control_paths[image_path] = potential_paths[:control_count_per_image]
724
+ logger.info(f"found {len(self.control_paths)} matching control images")
725
+
726
+ missing_controls = len(self.image_paths) - len(self.control_paths)
727
+ if missing_controls > 0:
728
+ missing_control_paths = set(self.image_paths) - set(self.control_paths.keys())
729
+ logger.error(f"Could not find matching control images for {missing_controls} images: {missing_control_paths}")
730
+ raise ValueError(f"Could not find matching control images for {missing_controls} images")
731
+
732
  def is_indexable(self):
733
  return True
734
 
735
  def __len__(self):
736
  return len(self.image_paths)
737
 
738
+ def get_image_data(self, idx: int) -> tuple[str, Image.Image, str, Optional[Image.Image]]:
739
  image_path = self.image_paths[idx]
740
  image = Image.open(image_path).convert("RGB")
741
 
742
  _, caption = self.get_caption(idx)
743
 
744
+ controls = None
745
+ if self.has_control:
746
+ controls = []
747
+ for control_path in self.control_paths[image_path]:
748
+ control = Image.open(control_path)
749
+ if control.mode != "RGB" and control.mode != "RGBA":
750
+ control = control.convert("RGB")
751
+ controls.append(control)
752
+
753
+ return image_path, image, caption, controls
754
 
755
  def get_caption(self, idx: int) -> tuple[str, str]:
756
  image_path = self.image_paths[idx]
 
788
 
789
 
790
  class ImageJsonlDatasource(ImageDatasource):
791
+ def __init__(self, image_jsonl_file: str, control_count_per_image: int = 1):
792
  super().__init__()
793
  self.image_jsonl_file = image_jsonl_file
794
+ self.control_count_per_image = control_count_per_image
795
  self.current_idx = 0
796
 
797
  # load jsonl
 
807
  self.data.append(data)
808
  logger.info(f"loaded {len(self.data)} images")
809
 
810
+ # Normalize control paths
811
+ for item in self.data:
812
+ if "control_path" in item:
813
+ item["control_path_0"] = item.pop("control_path")
814
+
815
+ # Ensure control paths are named consistently, from control_path_0000 to control_path_0, control_path_1, etc.
816
+ control_path_keys = [key for key in item.keys() if key.startswith("control_path_")]
817
+ control_path_keys.sort(key=lambda x: int(x.split("_")[-1]))
818
+ for i, key in enumerate(control_path_keys):
819
+ if key != f"control_path_{i}":
820
+ item[f"control_path_{i}"] = item.pop(key)
821
+
822
+ # Check if there are control paths in the JSONL
823
+ self.has_control = any("control_path_0" in item for item in self.data)
824
+ if self.has_control:
825
+ missing_control_images = [
826
+ item["image_path"]
827
+ for item in self.data
828
+ if sum(f"control_path_{i}" not in item for i in range(self.control_count_per_image)) > 0
829
+ ]
830
+ if missing_control_images:
831
+ logger.error(f"Some images do not have control paths in JSONL data: {missing_control_images}")
832
+ raise ValueError(f"Some images do not have control paths in JSONL data: {missing_control_images}")
833
+ logger.info(f"found {len(self.data)} images with {self.control_count_per_image} control images per image in JSONL data")
834
+
835
  def is_indexable(self):
836
  return True
837
 
838
  def __len__(self):
839
  return len(self.data)
840
 
841
+ def get_image_data(self, idx: int) -> tuple[str, Image.Image, str, Optional[list[Image.Image]]]:
842
  data = self.data[idx]
843
  image_path = data["image_path"]
844
  image = Image.open(image_path).convert("RGB")
845
 
846
  caption = data["caption"]
847
 
848
+ controls = None
849
+ if self.has_control:
850
+ controls = []
851
+ for i in range(self.control_count_per_image):
852
+ control_path = data[f"control_path_{i}"]
853
+ control = Image.open(control_path)
854
+ if control.mode != "RGB" and control.mode != "RGBA":
855
+ control = control.convert("RGB")
856
+ controls.append(control)
857
+
858
+ return image_path, image, caption, controls
859
 
860
  def get_caption(self, idx: int) -> tuple[str, str]:
861
  data = self.data[idx]
 
899
 
900
  self.bucket_selector = None
901
 
902
+ self.source_fps = None
903
+ self.target_fps = None
904
+
905
  def __len__(self):
906
  raise NotImplementedError
907
 
 
918
  end_frame = end_frame if end_frame is not None else self.end_frame
919
  bucket_selector = bucket_selector if bucket_selector is not None else self.bucket_selector
920
 
921
+ video = load_video(
922
+ video_path, start_frame, end_frame, bucket_selector, source_fps=self.source_fps, target_fps=self.target_fps
923
+ )
924
  return video
925
 
926
+ def get_control_data_from_path(
927
+ self,
928
+ control_path: str,
929
+ start_frame: Optional[int] = None,
930
+ end_frame: Optional[int] = None,
931
+ bucket_selector: Optional[BucketSelector] = None,
932
+ ) -> list[Image.Image]:
933
+ start_frame = start_frame if start_frame is not None else self.start_frame
934
+ end_frame = end_frame if end_frame is not None else self.end_frame
935
+ bucket_selector = bucket_selector if bucket_selector is not None else self.bucket_selector
936
+
937
+ control = load_video(
938
+ control_path, start_frame, end_frame, bucket_selector, source_fps=self.source_fps, target_fps=self.target_fps
939
+ )
940
+ return control
941
+
942
  def set_start_and_end_frame(self, start_frame: Optional[int], end_frame: Optional[int]):
943
  self.start_frame = start_frame
944
  self.end_frame = end_frame
 
946
  def set_bucket_selector(self, bucket_selector: BucketSelector):
947
  self.bucket_selector = bucket_selector
948
 
949
+ def set_source_and_target_fps(self, source_fps: Optional[float], target_fps: Optional[float]):
950
+ self.source_fps = source_fps
951
+ self.target_fps = target_fps
952
+
953
  def __iter__(self):
954
  raise NotImplementedError
955
 
 
958
 
959
 
960
  class VideoDirectoryDatasource(VideoDatasource):
961
+ def __init__(self, video_directory: str, caption_extension: Optional[str] = None, control_directory: Optional[str] = None):
962
  super().__init__()
963
  self.video_directory = video_directory
964
  self.caption_extension = caption_extension
965
+ self.control_directory = control_directory # newly added: control image directory
966
  self.current_idx = 0
967
 
968
+ # glob videos
969
+ logger.info(f"glob videos in {self.video_directory}")
970
  self.video_paths = glob_videos(self.video_directory)
971
  logger.info(f"found {len(self.video_paths)} videos")
972
 
973
+ # glob control images if specified
974
+ if self.control_directory is not None:
975
+ logger.info(f"glob control videos in {self.control_directory}")
976
+ self.has_control = True
977
+ self.control_paths = {}
978
+ for video_path in self.video_paths:
979
+ video_basename = os.path.basename(video_path)
980
+ # construct control path from video path
981
+ # for example: video_path = "vid/video.mp4" -> control_path = "control/video.mp4"
982
+ control_path = os.path.join(self.control_directory, video_basename)
983
+ if os.path.exists(control_path):
984
+ self.control_paths[video_path] = control_path
985
+ else:
986
+ # use the same base name for control path
987
+ base_name = os.path.splitext(video_basename)[0]
988
+
989
+ # directory with images. for example: video_path = "vid/video.mp4" -> control_path = "control/video"
990
+ potential_path = os.path.join(self.control_directory, base_name) # no extension
991
+ if os.path.isdir(potential_path):
992
+ self.control_paths[video_path] = potential_path
993
+ else:
994
+ # another extension for control path
995
+ # for example: video_path = "vid/video.mp4" -> control_path = "control/video.mov"
996
+ for ext in VIDEO_EXTENSIONS:
997
+ potential_path = os.path.join(self.control_directory, base_name + ext)
998
+ if os.path.exists(potential_path):
999
+ self.control_paths[video_path] = potential_path
1000
+ break
1001
+
1002
+ logger.info(f"found {len(self.control_paths)} matching control videos/images")
1003
+ # check if all videos have matching control paths, if not, raise an error
1004
+ missing_controls = len(self.video_paths) - len(self.control_paths)
1005
+ if missing_controls > 0:
1006
+ # logger.warning(f"Could not find matching control videos/images for {missing_controls} videos")
1007
+ missing_controls_videos = [video_path for video_path in self.video_paths if video_path not in self.control_paths]
1008
+ logger.error(
1009
+ f"Could not find matching control videos/images for {missing_controls} videos: {missing_controls_videos}"
1010
+ )
1011
+ raise ValueError(f"Could not find matching control videos/images for {missing_controls} videos")
1012
+
1013
  def is_indexable(self):
1014
  return True
1015
 
 
1022
  start_frame: Optional[int] = None,
1023
  end_frame: Optional[int] = None,
1024
  bucket_selector: Optional[BucketSelector] = None,
1025
+ ) -> tuple[str, list[Image.Image], str, Optional[list[Image.Image]]]:
1026
  video_path = self.video_paths[idx]
1027
  video = self.get_video_data_from_path(video_path, start_frame, end_frame, bucket_selector)
1028
 
1029
  _, caption = self.get_caption(idx)
1030
 
1031
+ control = None
1032
+ if self.control_directory is not None and video_path in self.control_paths:
1033
+ control_path = self.control_paths[video_path]
1034
+ control = self.get_control_data_from_path(control_path, start_frame, end_frame, bucket_selector)
1035
+
1036
+ return video_path, video, caption, control
1037
 
1038
  def get_caption(self, idx: int) -> tuple[str, str]:
1039
  video_path = self.video_paths[idx]
 
1083
  self.data.append(data)
1084
  logger.info(f"loaded {len(self.data)} videos")
1085
 
1086
+ # Check if there are control paths in the JSONL
1087
+ self.has_control = any("control_path" in item for item in self.data)
1088
+ if self.has_control:
1089
+ control_count = sum(1 for item in self.data if "control_path" in item)
1090
+ if control_count < len(self.data):
1091
+ missing_control_videos = [item["video_path"] for item in self.data if "control_path" not in item]
1092
+ logger.error(f"Some videos do not have control paths in JSONL data: {missing_control_videos}")
1093
+ raise ValueError(f"Some videos do not have control paths in JSONL data: {missing_control_videos}")
1094
+ logger.info(f"found {control_count} control videos/images in JSONL data")
1095
+
1096
  def is_indexable(self):
1097
  return True
1098
 
 
1105
  start_frame: Optional[int] = None,
1106
  end_frame: Optional[int] = None,
1107
  bucket_selector: Optional[BucketSelector] = None,
1108
+ ) -> tuple[str, list[Image.Image], str, Optional[list[Image.Image]]]:
1109
  data = self.data[idx]
1110
  video_path = data["video_path"]
1111
  video = self.get_video_data_from_path(video_path, start_frame, end_frame, bucket_selector)
1112
 
1113
  caption = data["caption"]
1114
 
1115
+ control = None
1116
+ if "control_path" in data and data["control_path"]:
1117
+ control_path = data["control_path"]
1118
+ control = self.get_control_data_from_path(control_path, start_frame, end_frame, bucket_selector)
1119
+
1120
+ return video_path, video, caption, control
1121
 
1122
  def get_caption(self, idx: int) -> tuple[str, str]:
1123
  data = self.data[idx]
 
1324
  bucket_no_upscale: bool,
1325
  image_directory: Optional[str] = None,
1326
  image_jsonl_file: Optional[str] = None,
1327
+ control_directory: Optional[str] = None,
1328
  cache_directory: Optional[str] = None,
1329
+ fp_latent_window_size: Optional[int] = 9,
1330
+ fp_1f_clean_indices: Optional[list[int]] = None,
1331
+ fp_1f_target_index: Optional[int] = None,
1332
+ fp_1f_no_post: Optional[bool] = False,
1333
  debug_dataset: bool = False,
1334
  architecture: str = "no_default",
1335
  ):
 
1346
  )
1347
  self.image_directory = image_directory
1348
  self.image_jsonl_file = image_jsonl_file
1349
+ self.control_directory = control_directory
1350
+ self.fp_latent_window_size = fp_latent_window_size
1351
+ self.fp_1f_clean_indices = fp_1f_clean_indices
1352
+ self.fp_1f_target_index = fp_1f_target_index
1353
+ self.fp_1f_no_post = fp_1f_no_post
1354
+
1355
+ control_count_per_image = 1
1356
+ if fp_1f_clean_indices is not None:
1357
+ control_count_per_image = len(fp_1f_clean_indices)
1358
+
1359
  if image_directory is not None:
1360
+ self.datasource = ImageDirectoryDatasource(
1361
+ image_directory, caption_extension, control_directory, control_count_per_image
1362
+ )
1363
  elif image_jsonl_file is not None:
1364
+ self.datasource = ImageJsonlDatasource(image_jsonl_file, control_count_per_image)
1365
  else:
1366
  raise ValueError("image_directory or image_jsonl_file must be specified")
1367
 
 
1370
 
1371
  self.batch_manager = None
1372
  self.num_train_items = 0
1373
+ self.has_control = self.datasource.has_control
1374
 
1375
  def get_metadata(self):
1376
  metadata = super().get_metadata()
 
1378
  metadata["image_directory"] = os.path.basename(self.image_directory)
1379
  if self.image_jsonl_file is not None:
1380
  metadata["image_jsonl_file"] = os.path.basename(self.image_jsonl_file)
1381
+ if self.control_directory is not None:
1382
+ metadata["control_directory"] = os.path.basename(self.control_directory)
1383
+ metadata["has_control"] = self.has_control
1384
  return metadata
1385
 
1386
  def get_total_image_count(self):
 
1405
  break # submit batch if possible
1406
 
1407
  for future in completed_futures:
1408
+ original_size, item_key, image, caption, controls = future.result()
1409
  bucket_height, bucket_width = image.shape[:2]
1410
  bucket_reso = (bucket_width, bucket_height)
1411
 
1412
  item_info = ItemInfo(item_key, caption, original_size, bucket_reso, content=image)
1413
  item_info.latent_cache_path = self.get_latent_cache_path(item_info)
1414
+ item_info.fp_latent_window_size = self.fp_latent_window_size
1415
+ item_info.fp_1f_clean_indices = self.fp_1f_clean_indices
1416
+ item_info.fp_1f_target_index = self.fp_1f_target_index
1417
+ item_info.fp_1f_no_post = self.fp_1f_no_post
1418
+
1419
+ if self.architecture == ARCHITECTURE_FRAMEPACK:
1420
+ # we need to split the bucket with latent window size and optional 1f clean indices, zero post
1421
+ bucket_reso = list(bucket_reso) + [self.fp_latent_window_size]
1422
+ if self.fp_1f_clean_indices is not None:
1423
+ bucket_reso.append(len(self.fp_1f_clean_indices))
1424
+ bucket_reso.append(self.fp_1f_no_post)
1425
+ bucket_reso = tuple(bucket_reso)
1426
+
1427
+ if controls is not None:
1428
+ item_info.control_content = controls
1429
 
1430
  if bucket_reso not in batches:
1431
  batches[bucket_reso] = []
 
1448
  for fetch_op in self.datasource:
1449
 
1450
  # fetch and resize image in a separate thread
1451
+ def fetch_and_resize(op: callable) -> tuple[tuple[int, int], str, Image.Image, str, Optional[Image.Image]]:
1452
+ image_key, image, caption, controls = op()
1453
  image: Image.Image
1454
  image_size = image.size
1455
 
1456
  bucket_reso = buckset_selector.get_bucket_resolution(image_size)
1457
+ image = resize_image_to_bucket(image, bucket_reso) # returns np.ndarray
1458
+ resized_controls = None
1459
+ if controls is not None:
1460
+ resized_controls = []
1461
+ for control in controls:
1462
+ resized_control = resize_image_to_bucket(control, bucket_reso) # returns np.ndarray
1463
+ resized_controls.append(resized_control)
1464
+
1465
+ return image_size, image_key, image, caption, resized_controls
1466
 
1467
  future = executor.submit(fetch_and_resize, fetch_op)
1468
  futures.append(future)
 
1507
  continue
1508
 
1509
  bucket_reso = bucket_selector.get_bucket_resolution(image_size)
1510
+
1511
+ if self.architecture == ARCHITECTURE_FRAMEPACK:
1512
+ # we need to split the bucket with latent window size and optional 1f clean indices, zero post
1513
+ bucket_reso = list(bucket_reso) + [self.fp_latent_window_size]
1514
+ if self.fp_1f_clean_indices is not None:
1515
+ bucket_reso.append(len(self.fp_1f_clean_indices))
1516
+ bucket_reso.append(self.fp_1f_no_post)
1517
+ bucket_reso = tuple(bucket_reso)
1518
+
1519
  item_info = ItemInfo(item_key, "", image_size, bucket_reso, latent_cache_path=cache_file)
1520
  item_info.text_encoder_output_cache_path = text_encoder_output_cache_file
1521
 
 
1545
 
1546
 
1547
  class VideoDataset(BaseDataset):
1548
+ TARGET_FPS_HUNYUAN = 24.0
1549
+ TARGET_FPS_WAN = 16.0
1550
+ TARGET_FPS_FRAMEPACK = 30.0
1551
+
1552
  def __init__(
1553
  self,
1554
  resolution: Tuple[int, int],
 
1561
  frame_stride: Optional[int] = 1,
1562
  frame_sample: Optional[int] = 1,
1563
  target_frames: Optional[list[int]] = None,
1564
+ max_frames: Optional[int] = None,
1565
+ source_fps: Optional[float] = None,
1566
  video_directory: Optional[str] = None,
1567
  video_jsonl_file: Optional[str] = None,
1568
+ control_directory: Optional[str] = None,
1569
  cache_directory: Optional[str] = None,
1570
+ fp_latent_window_size: Optional[int] = 9,
1571
  debug_dataset: bool = False,
1572
  architecture: str = "no_default",
1573
  ):
 
1584
  )
1585
  self.video_directory = video_directory
1586
  self.video_jsonl_file = video_jsonl_file
1587
+ self.control_directory = control_directory
1588
  self.frame_extraction = frame_extraction
1589
  self.frame_stride = frame_stride
1590
  self.frame_sample = frame_sample
1591
+ self.max_frames = max_frames
1592
+ self.source_fps = source_fps
1593
+ self.fp_latent_window_size = fp_latent_window_size
1594
+
1595
+ if self.architecture == ARCHITECTURE_HUNYUAN_VIDEO:
1596
+ self.target_fps = VideoDataset.TARGET_FPS_HUNYUAN
1597
+ elif self.architecture == ARCHITECTURE_WAN:
1598
+ self.target_fps = VideoDataset.TARGET_FPS_WAN
1599
+ elif self.architecture == ARCHITECTURE_FRAMEPACK:
1600
+ self.target_fps = VideoDataset.TARGET_FPS_FRAMEPACK
1601
+ else:
1602
+ raise ValueError(f"Unsupported architecture: {self.architecture}")
1603
+
1604
+ if target_frames is not None:
1605
+ target_frames = list(set(target_frames))
1606
+ target_frames.sort()
1607
+
1608
+ # round each value to N*4+1
1609
+ rounded_target_frames = [(f - 1) // 4 * 4 + 1 for f in target_frames]
1610
+ rounded_target_frames = list(set(rounded_target_frames))
1611
+ rounded_target_frames.sort()
1612
+
1613
+ # if value is changed, warn
1614
+ if target_frames != rounded_target_frames:
1615
+ logger.warning(f"target_frames are rounded to {rounded_target_frames}")
1616
+
1617
+ target_frames = tuple(rounded_target_frames)
1618
+
1619
+ self.target_frames = target_frames
1620
 
1621
  if video_directory is not None:
1622
+ self.datasource = VideoDirectoryDatasource(video_directory, caption_extension, control_directory)
1623
  elif video_jsonl_file is not None:
1624
  self.datasource = VideoJsonlDatasource(video_jsonl_file)
1625
 
 
1635
 
1636
  self.batch_manager = None
1637
  self.num_train_items = 0
1638
+ self.has_control = self.datasource.has_control
1639
 
1640
  def get_metadata(self):
1641
  metadata = super().get_metadata()
 
1643
  metadata["video_directory"] = os.path.basename(self.video_directory)
1644
  if self.video_jsonl_file is not None:
1645
  metadata["video_jsonl_file"] = os.path.basename(self.video_jsonl_file)
1646
+ if self.control_directory is not None:
1647
+ metadata["control_directory"] = os.path.basename(self.control_directory)
1648
  metadata["frame_extraction"] = self.frame_extraction
1649
  metadata["frame_stride"] = self.frame_stride
1650
  metadata["frame_sample"] = self.frame_sample
1651
  metadata["target_frames"] = self.target_frames
1652
+ metadata["max_frames"] = self.max_frames
1653
+ metadata["source_fps"] = self.source_fps
1654
+ metadata["has_control"] = self.has_control
1655
  return metadata
1656
 
1657
  def retrieve_latent_cache_batches(self, num_workers: int):
1658
  buckset_selector = BucketSelector(self.resolution, architecture=self.architecture)
1659
  self.datasource.set_bucket_selector(buckset_selector)
1660
+ if self.source_fps is not None:
1661
+ self.datasource.set_source_and_target_fps(self.source_fps, self.target_fps)
1662
+ else:
1663
+ self.datasource.set_source_and_target_fps(None, None) # no conversion
1664
 
1665
  executor = ThreadPoolExecutor(max_workers=num_workers)
1666
 
1667
+ # key: (width, height, frame_count) and optional latent_window_size, value: [ItemInfo]
1668
+ batches: dict[tuple[Any], list[ItemInfo]] = {}
1669
  futures = []
1670
 
1671
  def aggregate_future(consume_all: bool = False):
 
1679
  break # submit batch if possible
1680
 
1681
  for future in completed_futures:
1682
+ original_frame_size, video_key, video, caption, control = future.result()
1683
 
1684
  frame_count = len(video)
1685
  video = np.stack(video, axis=0)
1686
  height, width = video.shape[1:3]
1687
  bucket_reso = (width, height) # already resized
1688
 
1689
+ # process control images if available
1690
+ control_video = None
1691
+ if control is not None:
1692
+ # set frame count to the same as video
1693
+ if len(control) > frame_count:
1694
+ control = control[:frame_count]
1695
+ elif len(control) < frame_count:
1696
+ # if control is shorter than video, repeat the last frame
1697
+ last_frame = control[-1]
1698
+ control.extend([last_frame] * (frame_count - len(control)))
1699
+ control_video = np.stack(control, axis=0)
1700
+
1701
  crop_pos_and_frames = []
1702
  if self.frame_extraction == "head":
1703
  for target_frame in self.target_frames:
 
1722
  frame_indices = np.linspace(0, frame_count - target_frame, self.frame_sample, dtype=int)
1723
  for i in frame_indices:
1724
  crop_pos_and_frames.append((i, target_frame))
1725
+ elif self.frame_extraction == "full":
1726
+ # select all frames
1727
+ target_frame = min(frame_count, self.max_frames)
1728
+ target_frame = (target_frame - 1) // 4 * 4 + 1 # round to N*4+1
1729
+ crop_pos_and_frames.append((0, target_frame))
1730
  else:
1731
  raise ValueError(f"frame_extraction {self.frame_extraction} is not supported")
1732
 
 
1736
  item_key = f"{body}_{crop_pos:05d}-{target_frame:03d}{ext}"
1737
  batch_key = (*bucket_reso, target_frame) # bucket_reso with frame_count
1738
 
1739
+ if self.architecture == ARCHITECTURE_FRAMEPACK:
1740
+ # add latent window size to bucket resolution
1741
+ batch_key = (*batch_key, self.fp_latent_window_size)
1742
+
1743
+ # crop control video if available
1744
+ cropped_control = None
1745
+ if control_video is not None:
1746
+ cropped_control = control_video[crop_pos : crop_pos + target_frame]
1747
+
1748
  item_info = ItemInfo(
1749
  item_key, caption, original_frame_size, batch_key, frame_count=target_frame, content=cropped_video
1750
  )
1751
  item_info.latent_cache_path = self.get_latent_cache_path(item_info)
1752
+ item_info.control_content = cropped_control # None is allowed
1753
+ item_info.fp_latent_window_size = self.fp_latent_window_size
1754
 
1755
  batch = batches.get(batch_key, [])
1756
  batch.append(item_info)
 
1771
 
1772
  for operator in self.datasource:
1773
 
1774
+ def fetch_and_resize(op: callable) -> tuple[tuple[int, int], str, list[np.ndarray], str, Optional[list[np.ndarray]]]:
1775
+ result = op()
1776
+
1777
+ if len(result) == 3: # for backward compatibility TODO remove this in the future
1778
+ video_key, video, caption = result
1779
+ control = None
1780
+ else:
1781
+ video_key, video, caption, control = result
1782
+
1783
  video: list[np.ndarray]
1784
  frame_size = (video[0].shape[1], video[0].shape[0])
1785
 
 
1787
  bucket_reso = buckset_selector.get_bucket_resolution(frame_size)
1788
  video = [resize_image_to_bucket(frame, bucket_reso) for frame in video]
1789
 
1790
+ # resize control if necessary
1791
+ if control is not None:
1792
+ control = [resize_image_to_bucket(frame, bucket_reso) for frame in control]
1793
+
1794
+ return frame_size, video_key, video, caption, control
1795
 
1796
  future = executor.submit(fetch_and_resize, operator)
1797
  futures.append(future)
 
1829
  image_width, image_height = map(int, image_size.split("x"))
1830
  image_size = (image_width, image_height)
1831
 
1832
+ frame_pos, frame_count = tokens[-3].split("-")[:2] # "00000-000", or optional section index "00000-000-00"
1833
  frame_pos, frame_count = int(frame_pos), int(frame_count)
1834
 
1835
  item_key = "_".join(tokens[:-3])
docs/advanced_config.md CHANGED
@@ -2,6 +2,16 @@
2
 
3
  # Advanced configuration / 高度な設定
4

 
5
  ## How to specify `network_args` / `network_args`の指定方法
6
 
7
  The `--network_args` option is an option for specifying detailed arguments to LoRA. Specify the arguments in the form of `key=value` in `--network_args`.
@@ -148,4 +158,159 @@ Specify the project name with `--log_tracker_name` when using wandb.
148
  `--log_with wandb`オプションを指定するとwandb形式でログを保存することができます。`tensorboard`や`all`も指定可能です。デフォルトは`tensorboard`です。
149
 
150
  wandbを使用する場合は、`--log_tracker_name`でプロジェクト名を指定してください。
151
- </details>
 
 
2
 
3
  # Advanced configuration / 高度な設定
4
 
5
+ ## Table of contents / 目次
6
+
7
+ - [How to specify `network_args`](#how-to-specify-network_args--network_argsの指定方法)
8
+ - [LoRA+](#lora)
9
+ - [Select the target modules of LoRA](#select-the-target-modules-of-lora--loraの対象モジュールを選択する)
10
+ - [Save and view logs in TensorBoard format](#save-and-view-logs-in-tensorboard-format--tensorboard形式のログの保存と参照)
11
+ - [Save and view logs in wandb](#save-and-view-logs-in-wandb--wandbでログの保存と参照)
12
+ - [FP8 weight optimization for models](#fp8-weight-optimization-for-models--モデルの重みのfp8への最適化)
13
+ - [PyTorch Dynamo optimization for model training](#pytorch-dynamo-optimization-for-model-training--モデルの学習におけるpytorch-dynamoの最適化)
14
+
15
  ## How to specify `network_args` / `network_args`の指定方法
16
 
17
  The `--network_args` option is an option for specifying detailed arguments to LoRA. Specify the arguments in the form of `key=value` in `--network_args`.
 
158
  `--log_with wandb`オプションを指定するとwandb形式でログを保存することができます。`tensorboard`や`all`も指定可能です。デフォルトは`tensorboard`です。
159
 
160
  wandbを使用する場合は、`--log_tracker_name`でプロジェクト名を指定してください。
161
+ </details>
162
+
163
+ ## FP8 weight optimization for models / モデルの重みのFP8への最適化
164
+
165
+ The `--fp8_scaled` option is available to quantize the weights of the model to FP8 (E4M3) format with appropriate scaling. This reduces the VRAM usage while maintaining precision. Important weights are kept in FP16/BF16/FP32 format.
166
+
167
+ The model weights must be in fp16 or bf16. Weights that have been pre-converted to float8_e4m3 cannot be used.
168
+
169
+ Wan2.1 inference and training are supported.
170
+
171
+ Specify the `--fp8_scaled` option in addition to the `--fp8` option during inference.
172
+
173
+ Specify the `--fp8_scaled` option in addition to the `--fp8_base` option during training.
174
+
175
+ Acknowledgments: This feature is based on the [implementation](https://github.com/Tencent/HunyuanVideo/blob/7df4a45c7e424a3f6cd7d653a7ff1f60cddc1eb1/hyvideo/modules/fp8_optimization.py) of [HunyuanVideo](https://github.com/Tencent/HunyuanVideo). The selection of high-precision modules is based on the [implementation](https://github.com/tdrussell/diffusion-pipe/blob/407c04fdae1c9ab5e67b54d33bef62c3e0a8dbc7/models/wan.py) of [diffusion-pipe](https://github.com/tdrussell/diffusion-pipe). I would like to thank these repositories.
176
+
177
+ <details>
178
+ <summary>日本語</summary>
179
+ 重みを単純にFP8へcastするのではなく、適切なスケーリングでFP8形式に量子化することで、精度を維持しつつVRAM使用量を削減します。また、重要な重みはFP16/BF16/FP32形式で保持します。
180
+
181
+ モデルの重みは、fp16またはbf16が必要です。あらかじめfloat8_e4m3に変換された重みは使用できません。
182
+
183
+ Wan2.1の推論、学習のみ対応しています。
184
+
185
+ 推論時は`--fp8`オプションに加えて `--fp8_scaled`オプションを指定してください。
186
+
187
+ 学習時は`--fp8_base`オプションに加えて `--fp8_scaled`オプションを指定してください。
188
+
189
+ 謝辞:この機能は、[HunyuanVideo](https://github.com/Tencent/HunyuanVideo)の[実装](https://github.com/Tencent/HunyuanVideo/blob/7df4a45c7e424a3f6cd7d653a7ff1f60cddc1eb1/hyvideo/modules/fp8_optimization.py)を参考にしました。また、高精度モジュールの選択においては[diffusion-pipe](https://github.com/tdrussell/diffusion-pipe)の[実装](https://github.com/tdrussell/diffusion-pipe/blob/407c04fdae1c9ab5e67b54d33bef62c3e0a8dbc7/models/wan.py)を参考にしました。これらのリポジトリに感謝します。
190
+
191
+ </details>
192
+
193
+ ### Key features and implementation details / 主な特徴と実装の詳細
194
+
195
+ - Implements FP8 (E4M3) weight quantization for Linear layers
196
+ - Reduces VRAM requirements by using 8-bit weights for storage (VRAM usage is slightly higher than with the existing `--fp8` / `--fp8_base` options)
197
+ - Quantizes weights to FP8 format with appropriate scaling instead of simple cast to FP8
198
+ - Maintains computational precision by dequantizing to original precision (FP16/BF16/FP32) during forward pass
199
+ - Preserves important weights in FP16/BF16/FP32 format
200
+
201
+ The implementation:
202
+
203
+ 1. Quantizes weights to FP8 format with appropriate scaling
204
+ 2. Replaces weights by FP8 quantized weights and stores scale factors in model state dict
205
+ 3. Applies monkey patching to Linear layers for transparent dequantization during computation
206
+
207
+ <details>
208
+ <summary>日本語</summary>
209
+
210
+ - Linear層のFP8(E4M3)重み量子化を実装
211
+ - 8ビットの重みを使用することでVRAM使用量を削減(既存の`--fp8` `--fp8_base` オプションに比べて微増)
212
+ - 単純なFP8へのcastではなく、適切な値でスケールして重みをFP8形式に量子化
213
+ - forward時に元の精度(FP16/BF16/FP32)に逆量子化して計算精度を維持
214
+ - 精度が重要な重みはFP16/BF16/FP32のまま保持
215
+
216
+ 実装:
217
+
218
+ 1. 精度を維持できる適切な倍率で重みをFP8形式に量子化
219
+ 2. 重みをFP8量子化重みに置き換え、倍率をモデルのstate dictに保存
220
+ 3. Linear層にmonkey patchingすることでモデルを変更せずに逆量子化
221
+ </details>
222
+
223
+ ## PyTorch Dynamo optimization for model training / モデルの学習におけるPyTorch Dynamoの最適化
224
+
225
+ The PyTorch Dynamo options are now available to optimize the training process. PyTorch Dynamo is a Python-level JIT compiler designed to make unmodified PyTorch programs faster by using TorchInductor, a deep learning compiler. This integration allows for potential speedups in training while maintaining model accuracy.
226
+
227
+ [PR #215](https://github.com/kohya-ss/musubi-tuner/pull/215) added this feature.
228
+
229
+ Specify the `--dynamo_backend` option to enable Dynamo optimization with one of the available backends from the `DynamoBackend` enum.
230
+
231
+ Additional options allow for fine-tuning the Dynamo behavior:
232
+ - `--dynamo_mode`: Controls the optimization strategy
233
+ - `--dynamo_fullgraph`: Enables fullgraph mode for potentially better optimization
234
+ - `--dynamo_dynamic`: Enables dynamic shape handling
235
+
236
+ The `--dynamo_dynamic` option has been reported to have many problems based on the validation in PR #215.
237
+
238
+ ### Available options:
239
+
240
+ ```
241
+ --dynamo_backend {NO, INDUCTOR, NVFUSER, CUDAGRAPHS, CUDAGRAPHS_FALLBACK, etc.}
242
+ Specifies the Dynamo backend to use (default is NO, which disables Dynamo)
243
+
244
+ --dynamo_mode {default, reduce-overhead, max-autotune}
245
+ Specifies the optimization mode (default is 'default')
246
+ - 'default': Standard optimization
247
+ - 'reduce-overhead': Focuses on reducing compilation overhead
248
+ - 'max-autotune': Performs extensive autotuning for potentially better performance
249
+
250
+ --dynamo_fullgraph
251
+ Flag to enable fullgraph mode, which attempts to capture and optimize the entire model graph
252
+
253
+ --dynamo_dynamic
254
+ Flag to enable dynamic shape handling for models with variable input shapes
255
+ ```
256
+
257
+ ### Usage example:
258
+
259
+ ```bash
260
+ python train_video_model.py --dynamo_backend INDUCTOR --dynamo_mode default
261
+ ```
262
+
263
+ For more aggressive optimization:
264
+ ```bash
265
+ python train_video_model.py --dynamo_backend INDUCTOR --dynamo_mode max-autotune --dynamo_fullgraph
266
+ ```
267
+
268
+ Note: The best combination of options may depend on your specific model and hardware. Experimentation may be necessary to find the optimal configuration.
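+ For reference, these flags map roughly onto the arguments of `torch.compile` (a simplified sketch for orientation only; the training scripts apply the options through the `DynamoBackend` mechanism rather than calling `torch.compile` directly like this):
+
+ ```python
+ import torch
+
+ model = torch.nn.Linear(64, 64)  # placeholder for the model being trained
+
+ # roughly what --dynamo_backend INDUCTOR --dynamo_mode max-autotune --dynamo_fullgraph enables
+ compiled = torch.compile(
+     model,
+     backend="inductor",
+     mode="max-autotune",
+     fullgraph=True,
+     dynamic=False,  # --dynamo_dynamic would set this to True
+ )
+ out = compiled(torch.randn(1, 64))
+ ```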
269
+
270
+ <details>
271
+ <summary>日本語</summary>
272
+ PyTorch Dynamoオプションが学習プロセスを最適化するために追加されました。PyTorch Dynamoは、TorchInductor(ディープラーニングコンパイラ)を使用して、変更を加えることなくPyTorchプログラムを高速化するためのPythonレベルのJITコンパイラです。この統合により、モデルの精度を維持しながら学習の高速化が期待できます。
273
+
274
+ [PR #215](https://github.com/kohya-ss/musubi-tuner/pull/215) で追加されました。
275
+
276
+ `--dynamo_backend`オプションを指定して、`DynamoBackend`列挙型から利用可能なバックエンドの一つを選択することで、Dynamo最適化を有効にします。
277
+
278
+ 追加のオプションにより、Dynamoの動作を微調整できます:
279
+ - `--dynamo_mode`:最適化戦略を制御します
280
+ - `--dynamo_fullgraph`:より良い最適化の可能性のためにフルグラフモードを有効にします
281
+ - `--dynamo_dynamic`:動的形状処理を有効にします
282
+
283
+ PR #215での検証によると、`--dynamo_dynamic`には問題が多いことが報告されています。
284
+
285
+ __利用可能なオプション:__
286
+
287
+ ```
288
+ --dynamo_backend {NO, INDUCTOR, NVFUSER, CUDAGRAPHS, CUDAGRAPHS_FALLBACK, など}
289
+ 使用するDynamoバックエンドを指定します(デフォルトはNOで、Dynamoを無効にします)
290
+
291
+ --dynamo_mode {default, reduce-overhead, max-autotune}
292
+ 最適化モードを指定します(デフォルトは 'default')
293
+ - 'default':標準的な最適化
294
+ - 'reduce-overhead':コンパイルのオーバーヘッド削減に焦点を当てる
295
+ - 'max-autotune':より良いパフォーマンスのために広範な自動調整を実行
296
+
297
+ --dynamo_fullgraph
298
+ フルグラフモードを有効にするフラグ。モデルグラフ全体をキャプチャして最適化しようとします
299
+
300
+ --dynamo_dynamic
301
+ 可変入力形状を持つモデルのための動的形状処理を有効にするフラグ
302
+ ```
303
+
304
+ __使用例:__
305
+
306
+ ```bash
307
+ python train_video_model.py --dynamo_backend INDUCTOR --dynamo_mode default
308
+ ```
309
+
310
+ より積極的な最適化の場合:
311
+ ```bash
312
+ python train_video_model.py --dynamo_backend INDUCTOR --dynamo_mode max-autotune --dynamo_fullgraph
313
+ ```
314
+
315
+ 注意:最適なオプションの組み合わせは、特定のモデルとハードウェアに依存する場���があります。最適な構成を見つけるために実験が必要かもしれません。
316
+ </details>
docs/framepack.md ADDED
@@ -0,0 +1,607 @@
 
 
1
+ # FramePack
2
+
3
+ ## Overview / 概要
4
+
5
+ This document describes the usage of the [FramePack](https://github.com/lllyasviel/FramePack) architecture within the Musubi Tuner framework. FramePack is a novel video generation architecture developed by lllyasviel.
6
+
7
+ Key differences from HunyuanVideo:
8
+ - FramePack only supports Image-to-Video (I2V) generation. Text-to-Video (T2V) is not supported.
9
+ - It utilizes a different DiT model architecture and requires an additional Image Encoder. The VAE is the same as HunyuanVideo's. The Text Encoders appear to be the same as HunyuanVideo's, but we use them following the original FramePack method.
10
+ - Caching and training scripts are specific to FramePack (`fpack_*.py`).
11
+ - Due to its progressive generation nature, VRAM usage can be significantly lower, especially for longer videos, compared to other architectures.
12
+
13
+ There is no detailed official explanation of how to train the model; the training support here is based on the FramePack implementation and paper.
14
+
15
+ This feature is experimental.
16
+
17
+ For one-frame inference and training, see [here](./framepack_1f.md).
18
+
19
+ <details>
20
+ <summary>日本語</summary>
21
+
22
+ このドキュメントは、Musubi Tunerフレームワーク内での[FramePack](https://github.com/lllyasviel/FramePack) アーキテクチャの使用法について説明しています。FramePackは、lllyasviel氏によって開発された新しいビデオ生成アーキテクチャです。
23
+
24
+ HunyuanVideoとの主な違いは次のとおりです。
25
+ - FramePackは、画像からビデオ(I2V)生成のみをサポートしています。テキストからビデオ(T2V)はサポートされていません。
26
+ - 異なるDiTモデルアーキテクチャを使用し、追加の画像エンコーダーが必要です。VAEはHunyuanVideoと同じです。テキストエンコーダーはHunyuanVideoと同じと思われますが、FramePack公式と同じ方法で推論を行っています。
27
+ - キャッシングと学習スクリプトはFramePack専用(`fpack_*.py`)です。
28
+ - セクションずつ生成するため、他のアーキテクチャと比較して、特に長いビデオの場合、VRAM使用量が大幅に少なくなる可能性があります。
29
+
30
+ 学習方法について公式からは詳細な説明はありませんが、FramePackの実装と論文を参考にしています。
31
+
32
+ この機能は実験的なものです。
33
+
34
+ 1フレーム推論、学習については[こちら](./framepack_1f.md)を参照してください。
35
+ </details>
36
+
37
+ ## Download the model / モデルのダウンロード
38
+
39
+ You need to download the DiT, VAE, Text Encoder 1 (LLaMA), Text Encoder 2 (CLIP), and Image Encoder (SigLIP) models specifically for FramePack. Several download options are available for each component.
40
+
41
+ **Note:** The weights are publicly available on the following page: [maybleMyers/framepack_h1111](https://huggingface.co/maybleMyers/framepack_h1111) (except for FramePack-F1). Thank you maybleMyers!
42
+
43
+ ### DiT Model
44
+
45
+ Choose one of the following methods:
46
+
47
+ 1. **From lllyasviel's Hugging Face repo:** Download the three `.safetensors` files (starting with `diffusion_pytorch_model-00001-of-00003.safetensors`) from [lllyasviel/FramePackI2V_HY](https://huggingface.co/lllyasviel/FramePackI2V_HY). Specify the path to the first file (`...-00001-of-00003.safetensors`) as the `--dit` argument. For FramePack-F1, download from [lllyasviel/FramePack_F1_I2V_HY_20250503](https://huggingface.co/lllyasviel/FramePack_F1_I2V_HY_20250503).
48
+
49
+ 2. **From local FramePack installation:** If you have cloned and run the official FramePack repository, the model might be downloaded locally. Specify the path to the snapshot directory, e.g., `path/to/FramePack/hf_download/hub/models--lllyasviel--FramePackI2V_HY/snapshots/<hex-uuid-folder>`. FramePack-F1 is also available in the same way.
50
+
51
+ 3. **From Kijai's Hugging Face repo:** Download the single file `FramePackI2V_HY_bf16.safetensors` from [Kijai/HunyuanVideo_comfy](https://huggingface.co/Kijai/HunyuanVideo_comfy/blob/main/FramePackI2V_HY_bf16.safetensors). Specify the path to this file as the `--dit` argument. No FramePack-F1 model is available here currently.
52
+
53
+ ### VAE Model
54
+
55
+ Choose one of the following methods:
56
+
57
+ 1. **Use official HunyuanVideo VAE:** Follow the instructions in the main [README.md](../README.md#model-download).
58
+ 2. **From hunyuanvideo-community Hugging Face repo:** Download `vae/diffusion_pytorch_model.safetensors` from [hunyuanvideo-community/HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo).
59
+ 3. **From local FramePack installation:** If you have cloned and run the official FramePack repository, the VAE might be downloaded locally within the HunyuanVideo community model snapshot. Specify the path to the snapshot directory, e.g., `path/to/FramePack/hf_download/hub/models--hunyuanvideo-community--HunyuanVideo/snapshots/<hex-uuid-folder>`.
60
+
61
+ ### Text Encoder 1 (LLaMA) Model
62
+
63
+ Choose one of the following methods:
64
+
65
+ 1. **From Comfy-Org Hugging Face repo:** Download `split_files/text_encoders/llava_llama3_fp16.safetensors` from [Comfy-Org/HunyuanVideo_repackaged](https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged).
66
+ 2. **From hunyuanvideo-community Hugging Face repo:** Download the four `.safetensors` files (starting with `text_encoder/model-00001-of-00004.safetensors`) from [hunyuanvideo-community/HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo). Specify the path to the first file (`...-00001-of-00004.safetensors`) as the `--text_encoder1` argument.
67
+ 3. **From local FramePack installation:** (Same as VAE) Specify the path to the HunyuanVideo community model snapshot directory, e.g., `path/to/FramePack/hf_download/hub/models--hunyuanvideo-community--HunyuanVideo/snapshots/<hex-uuid-folder>`.
68
+
69
+ ### Text Encoder 2 (CLIP) Model
70
+
71
+ Choose one of the following methods:
72
+
73
+ 1. **From Comfy-Org Hugging Face repo:** Download `split_files/text_encoders/clip_l.safetensors` from [Comfy-Org/HunyuanVideo_repackaged](https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged).
74
+ 2. **From hunyuanvideo-community Hugging Face repo:** Download `text_encoder_2/model.safetensors` from [hunyuanvideo-community/HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo).
75
+ 3. **From local FramePack installation:** (Same as VAE) Specify the path to the HunyuanVideo community model snapshot directory, e.g., `path/to/FramePack/hf_download/hub/models--hunyuanvideo-community--HunyuanVideo/snapshots/<hex-uuid-folder>`.
76
+
77
+ ### Image Encoder (SigLIP) Model
78
+
79
+ Choose one of the following methods:
80
+
81
+ 1. **From Comfy-Org Hugging Face repo:** Download `sigclip_vision_patch14_384.safetensors` from [Comfy-Org/sigclip_vision_384](https://huggingface.co/Comfy-Org/sigclip_vision_384).
82
+ 2. **From lllyasviel's Hugging Face repo:** Download `image_encoder/model.safetensors` from [lllyasviel/flux_redux_bfl](https://huggingface.co/lllyasviel/flux_redux_bfl).
83
+ 3. **From local FramePack installation:** If you have cloned and run the official FramePack repository, the model might be downloaded locally. Specify the path to the snapshot directory, e.g., `path/to/FramePack/hf_download/hub/models--lllyasviel--flux_redux_bfl/snapshots/<hex-uuid-folder>`.
84
+
85
+ <details>
86
+ <summary>日本語</summary>
87
+
88
+ ※以下のページに重みが一括で公開されています(FramePack-F1を除く)。maybleMyers 氏に感謝いたします。: https://huggingface.co/maybleMyers/framepack_h1111
89
+
90
+ DiT、VAE、テキストエンコーダー1(LLaMA)、テキストエンコーダー2(CLIP)、および画像エンコーダー(SigLIP)モデルは複数の方法でダウンロードできます。英語の説明を参考にして、ダウンロードしてください。
91
+
92
+ FramePack公式のリポジトリをクローンして実行した場合、モデルはローカルにダウンロードされている可能性があります。スナップショットディレクトリへのパスを指定してください。例:`path/to/FramePack/hf_download/hub/models--lllyasviel--flux_redux_bfl/snapshots/<hex-uuid-folder>`
93
+
94
+ HunyuanVideoの推論をComfyUIですでに行っている場合、いくつかのモデルはすでにダウンロードされている可能性があります。
95
+ </details>
96
+
97
+ ## Pre-caching / 事前キャッシング
98
+
99
+ The default resolution for FramePack is 640x640. See [the source code](../frame_pack/bucket_tools.py) for the default resolution of each bucket.
100
+
101
+ The dataset for training must be a video dataset. Image datasets are not supported. You can train on videos of any length. Specify `frame_extraction` as `full` and set `max_frames` to a sufficiently large value. However, if the video is too long, you may run out of VRAM during VAE encoding.
102
+
103
+ ### Latent Pre-caching / latentの事前キャッシング
104
+
105
+ Latent pre-caching uses a dedicated script for FramePack. You **must** provide the Image Encoder model.
106
+
107
+ ```bash
108
+ python fpack_cache_latents.py \
109
+ --dataset_config path/to/toml \
110
+ --vae path/to/vae_model.safetensors \
111
+ --image_encoder path/to/image_encoder_model.safetensors \
112
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128
113
+ ```
114
+
115
+ Key differences from HunyuanVideo caching:
116
+ - Uses `fpack_cache_latents.py`.
117
+ - Requires the `--image_encoder` argument pointing to the downloaded SigLIP model.
118
+ - The script generates multiple cache files per video, each corresponding to a different section, with the section index appended to the filename (e.g., `..._frame_pos-0000-count_...` becomes `..._frame_pos-0000-0000-count_...`, `..._frame_pos-0000-0001-count_...`, etc.).
119
+ - Image embeddings are calculated using the Image Encoder and stored in the cache files alongside the latents.
120
+
121
+ For VRAM savings during VAE decoding, consider using `--vae_chunk_size` and `--vae_spatial_tile_sample_min_size`. If VRAM is overflowing and using shared memory, it is recommended to set `--vae_chunk_size` to 16 or 8, and `--vae_spatial_tile_sample_min_size` to 64 or 32.
122
+
123
+ Specifying `--f1` is required for FramePack-F1 training, and `--one_frame` for one-frame training. If you later add or remove either of these options, overwrite the existing cache by re-running without `--skip_existing`.
124
+
125
+ The `--one_frame_no_2x` and `--one_frame_no_4x` options are available for one-frame training; they are described in the [one-frame documentation](./framepack_1f.md).
126
+
127
+ **FramePack-F1 support:**
128
+ You can apply the FramePack-F1 sampling method by specifying `--f1` during caching. The training script also requires specifying `--f1` to change the options during sample generation.
129
+
130
+ By default, the sampling method used is Inverted anti-drifting (the same as during inference with the original FramePack model, using the latent and index in reverse order), described in the paper. You can switch to FramePack-F1 sampling (Vanilla sampling, using the temporally ordered latent and index) by specifying `--f1`.
131
+
132
+ <details>
133
+ <summary>日本語</summary>
134
+
135
+ FramePackのデフォルト解像度は640x640です。各バケットのデフォルト解像度については、[ソースコード](../frame_pack/bucket_tools.py)を参照してください。
136
+
137
+ 画像データセットでの学習は行えません。また動画の長さによらず学習可能です。 `frame_extraction` に `full` を指定して、`max_frames` に十分に大きな値を指定してください。ただし、あまりにも長いとVAEのencodeでVRAMが不足する可能性があります。
138
+
139
+ latentの事前キャッシングはFramePack専用のスクリプトを使用します。画像エンコーダーモデルを指定する必要があります。
140
+
141
+ HunyuanVideoのキャッシングとの主な違いは次のとおりです。
142
+ - `fpack_cache_latents.py`を使用します。
143
+ - ダウンロードしたSigLIPモデルを指す`--image_encoder`引数が必要です。
144
+ - スクリプトは、各ビデオに対して複数のキャッシュファイルを生成します。各ファイルは異なるセクションに対応し、セクションインデックスがファイル名に追加されます(例:`..._frame_pos-0000-count_...`は`..._frame_pos-0000-0000-count_...`、`..._frame_pos-0000-0001-count_...`などになります)。
145
+ - 画像埋め込みは画像エンコーダーを使用して計算され、latentとともにキャッシュファイルに保存されます。
146
+
147
+ VAEのdecode時のVRAM節約のために、`--vae_chunk_size`と`--vae_spatial_tile_sample_min_size`を使用することを検討してください。VRAMがあふれて共有メモリを使用している場合には、`--vae_chunk_size`を16、8などに、`--vae_spatial_tile_sample_min_size`を64、32などに変更することをお勧めします。
148
+
149
+ FramePack-F1の学習を行う場合は`--f1`を指定してください。これらのオプションの有無を変更する場合には、`--skip_existing`を指定せずに既存のキャッシュを上書きしてください。
150
+
151
+ **FramePack-F1のサポート:**
152
+ キャッシュ時のオプションに`--f1`を指定することで、FramePack-F1のサンプリング方法を適用できます。学習スクリプトについても`--f1`を指定してサンプル生成時のオプションを変更する必要があります。
153
+
154
+ デフォルトでは、論文のサンプリング方法 Inverted anti-drifting (無印のFramePackの推論時と同じ、逆順の latent と index を使用)を使用します。`--f1`を指定すると FramePack-F1 の Vanilla sampling (時間順の latent と index を使用)に変更できます。
155
+ </details>
156
+
157
+ ### Text Encoder Output Pre-caching / テキストエンコーダー出力の事前キャッシング
158
+
159
+ Text encoder output pre-caching also uses a dedicated script.
160
+
161
+ ```bash
162
+ python fpack_cache_text_encoder_outputs.py \
163
+ --dataset_config path/to/toml \
164
+ --text_encoder1 path/to/text_encoder1 \
165
+ --text_encoder2 path/to/text_encoder2 \
166
+ --batch_size 16
167
+ ```
168
+
169
+ Key differences from HunyuanVideo caching:
170
+ - Uses `fpack_cache_text_encoder_outputs.py`.
171
+ - Requires both `--text_encoder1` (LLaMA) and `--text_encoder2` (CLIP) arguments.
172
+ - Uses `--fp8_llm` option to run the LLaMA Text Encoder 1 in fp8 mode for VRAM savings (similar to `--fp8_t5` in Wan2.1).
173
+ - Saves LLaMA embeddings, attention mask, and CLIP pooler output to the cache file.
174
+
175
+ <details>
176
+ <summary>日本語</summary>
177
+
178
+ テキストエンコーダー出力の事前キャッシングも専用のスクリプトを使用します。
179
+
180
+ HunyuanVideoのキャッシングとの主な違いは次のとおりです。
181
+ - `fpack_cache_text_encoder_outputs.py`を使用します。
182
+ - LLaMAとCLIPの両方の引数が必要です。
183
+ - LLaMAテキストエンコーダー1をfp8モードで実行するための`--fp8_llm`オプションを使用します(Wan2.1の`--fp8_t5`に似ています)。
184
+ - LLaMAの埋め込み、アテンションマスク、CLIPのプーラー出力をキャッシュファイルに保存します。
185
+
186
+ </details>
187
+
188
+
189
+ ## Training / 学習
190
+
191
+ ### Training
192
+
193
+ Training uses a dedicated script `fpack_train_network.py`. Remember FramePack only supports I2V training.
194
+
195
+ ```bash
196
+ accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 fpack_train_network.py \
197
+ --dit path/to/dit_model \
198
+ --vae path/to/vae_model.safetensors \
199
+ --text_encoder1 path/to/text_encoder1 \
200
+ --text_encoder2 path/to/text_encoder2 \
201
+ --image_encoder path/to/image_encoder_model.safetensors \
202
+ --dataset_config path/to/toml \
203
+ --sdpa --mixed_precision bf16 \
204
+ --optimizer_type adamw8bit --learning_rate 2e-4 --gradient_checkpointing \
205
+ --timestep_sampling shift --weighting_scheme none --discrete_flow_shift 3.0 \
206
+ --max_data_loader_n_workers 2 --persistent_data_loader_workers \
207
+ --network_module networks.lora_framepack --network_dim 32 \
208
+ --max_train_epochs 16 --save_every_n_epochs 1 --seed 42 \
209
+ --output_dir path/to/output_dir --output_name name-of-lora
210
+ ```
211
+
212
+ If you use the command prompt on Windows (not PowerShell), you may need to write the command on a single line, or use `^` instead of `\` at the end of each line to continue it.
213
+
214
+ The maximum value for `--blocks_to_swap` is 36. The default resolution for FramePack is 640x640, which requires around 17GB of VRAM. If you run out of VRAM, consider lowering the dataset resolution.
215
+
216
+ Key differences from HunyuanVideo training:
217
+ - Uses `fpack_train_network.py`.
218
+ - `--f1` option is available for FramePack-F1 model training. You need to specify the FramePack-F1 model as `--dit`. This option only changes the sample generation during training. The training process itself is the same as the original FramePack model.
219
+ - **Requires** specifying `--vae`, `--text_encoder1`, `--text_encoder2`, and `--image_encoder`.
220
+ - **Requires** specifying `--network_module networks.lora_framepack`.
221
+ - Optional `--latent_window_size` argument (default 9, should match caching).
222
+ - Memory saving options like `--fp8` (for DiT) and `--fp8_llm` (for Text Encoder 1) are available. `--fp8_scaled` is recommended when using `--fp8` for DiT.
223
+ - `--vae_chunk_size` and `--vae_spatial_tile_sample_min_size` options are available for the VAE to prevent out-of-memory during sampling (similar to caching).
224
+ - `--gradient_checkpointing` is available for memory savings.
225
+ - If you encounter an error when the batch size is greater than 1 (with `--sdpa` or `--xformers` this always results in an error), specify `--split_attn`.
226
+ <!-- - Use `convert_lora.py` for converting the LoRA weights after training, similar to HunyuanVideo. -->
227
+
228
+ Training settings (learning rate, optimizers, etc.) are experimental. Feedback is welcome.
229
+
230
+ <details>
231
+ <summary>日本語</summary>
232
+
233
+ FramePackの学習は専用のスクリプト`fpack_train_network.py`を使用します。FramePackはI2V学習のみをサポートしています。
234
+
235
+ コマンド記述例は英語版を参考にしてください。WindowsでPowerShellではなくコマンドプロンプトを使用している場合、コマンドを1行で記述するか、各行の末尾に`\`の代わりに`^`を付けてコマンドを続ける必要があります。
236
+
237
+ `--blocks_to_swap`の最大値は36です。FramePackのデフォルト解像度(640x640)では、17GB程度のVRAMが必要です。VRAM容量が不足する場合は、データセットの解像度を下げてください。
238
+
239
+ HunyuanVideoの学習との主な違いは次のとおりです。
240
+ - `fpack_train_network.py`を使用します。
241
+ - FramePack-F1モデルの学習時には`--f1`を指定してください。この場合、`--dit`にFramePack-F1モデルを指定する必要があります。このオプションは学習時のサンプル生成時のみに影響し、学習プロセス自体は元のFramePackモデルと同じです。
242
+ - `--vae`、`--text_encoder1`、`--text_encoder2`、`--image_encoder`を指定する必要があります。
243
+ - `--network_module networks.lora_framepack`を指定する必要があります。
244
+ - 必要に応じて`--latent_window_size`引数(デフォルト9)を指定できます(キャッシング時と一致させる必要があります)。
245
+ - `--fp8`(DiT用)や`--fp8_llm`(テキストエンコーダー1用)などのメモリ節約オプションが利用可能です。`--fp8_scaled`を使用することをお勧めします。
246
+ - サンプル生成時にメモリ不足を防ぐため、VAE用の`--vae_chunk_size`、`--vae_spatial_tile_sample_min_size`オプションが利用可能です(キャッシング時と同様)。
247
+ - メモリ節約のために`--gradient_checkpointing`が利用可能です。
248
+ - バッチサイズが1より大きい場合にエラーが出た時には(特に`--sdpa`や`--xformers`を指定すると必ずエラーになります。)、`--split_attn`を指定してください。
249
+
250
+ </details>
251
+
252
+ ## Inference
253
+
254
+ Inference uses a dedicated script `fpack_generate_video.py`.
255
+
256
+ ```bash
257
+ python fpack_generate_video.py \
258
+ --dit path/to/dit_model \
259
+ --vae path/to/vae_model.safetensors \
260
+ --text_encoder1 path/to/text_encoder1 \
261
+ --text_encoder2 path/to/text_encoder2 \
262
+ --image_encoder path/to/image_encoder_model.safetensors \
263
+ --image_path path/to/start_image.jpg \
264
+ --prompt "A cat walks on the grass, realistic style." \
265
+ --video_size 512 768 --video_seconds 5 --fps 30 --infer_steps 25 \
266
+ --attn_mode sdpa --fp8_scaled \
267
+ --vae_chunk_size 32 --vae_spatial_tile_sample_min_size 128 \
268
+ --save_path path/to/save/dir --output_type both \
269
+ --seed 1234 --lora_multiplier 1.0 --lora_weight path/to/lora.safetensors
270
+ ```
271
+ <!-- --embedded_cfg_scale 10.0 --guidance_scale 1.0 \ -->
272
+
273
+ Key differences from HunyuanVideo inference:
274
+ - Uses `fpack_generate_video.py`.
275
+ - `--f1` option is available for FramePack-F1 model inference (forward generation). You need to specify the FramePack-F1 model as `--dit`.
276
+ - **Requires** specifying `--vae`, `--text_encoder1`, `--text_encoder2`, and `--image_encoder`.
277
+ - **Requires** specifying `--image_path` for the starting frame.
278
+ - **Requires** specifying `--video_seconds` or `--video_sections`. `--video_seconds` specifies the length of the video in seconds, while `--video_sections` specifies the number of sections. If `--video_sections` is specified, `--video_seconds` is ignored.
279
+ - `--video_size` is the size of the generated video; height and width are specified in that order.
280
+ - `--prompt`: Prompt for generation.
281
+ - Optional `--latent_window_size` argument (default 9, should match caching and training).
282
+ - `--fp8_scaled` option is available for DiT to reduce memory usage. Quality may be slightly lower. `--fp8_llm` option is available to reduce memory usage of Text Encoder 1. `--fp8` alone is also an option for DiT but `--fp8_scaled` potentially offers better quality.
283
+ - LoRA loading options (`--lora_weight`, `--lora_multiplier`, `--include_patterns`, `--exclude_patterns`) are available. `--lycoris` is also supported.
284
+ - `--embedded_cfg_scale` (default 10.0) controls the distilled guidance scale.
285
+ - `--guidance_scale` (default 1.0) controls the standard classifier-free guidance scale. **Changing this from 1.0 is generally not recommended for the base FramePack model.**
286
+ - `--guidance_rescale` (default 0.0) is available but typically not needed.
287
+ - `--bulk_decode` option can decode all frames at once, potentially faster but uses more VRAM during decoding. `--vae_chunk_size` and `--vae_spatial_tile_sample_min_size` options are recommended to prevent out-of-memory errors.
288
+ - `--sample_solver` (default `unipc`) is available but only `unipc` is implemented.
289
+ - `--save_merged_model` option is available to save the DiT model after merging LoRA weights. Inference is skipped if this is specified.
290
+ - `--latent_paddings` option overrides the default padding for each section. Specify it as a comma-separated list of integers, e.g., `--latent_paddings 0,0,0,0`. This option is ignored if `--f1` is specified.
291
+ - `--custom_system_prompt` option overrides the default system prompt for the LLaMA Text Encoder 1. Specify it as a string. See [here](../hunyuan_model/text_encoder.py#L152) for the default system prompt.
292
+ - `--rope_scaling_timestep_threshold` option is the RoPE scaling timestep threshold, default is None (disabled). If set, RoPE scaling is applied only when the timestep exceeds the threshold. Start with around 800 and adjust as needed. This option is intended for one-frame inference and may not be suitable for other cases.
293
+ - `--rope_scaling_factor` option is the RoPE scaling factor, default is 0.5, assuming a resolution of 2x. For 1.5x resolution, around 0.7 is recommended.
294
+
295
+ Other options like `--video_size`, `--fps`, `--infer_steps`, `--save_path`, `--output_type`, `--seed`, `--attn_mode`, `--blocks_to_swap`, `--vae_chunk_size`, `--vae_spatial_tile_sample_min_size` function similarly to HunyuanVideo/Wan2.1 where applicable.
296
+
297
+ `--output_type` supports `latent_images` in addition to the options available in HunyuanVideo/Wan2.1. This option saves the latent and image files in the specified directory.
298
+
299
+ The LoRA weights that can be specified in `--lora_weight` are not limited to the FramePack weights trained in this repository. You can also specify the HunyuanVideo LoRA weights from this repository and the HunyuanVideo LoRA weights from diffusion-pipe (automatic detection).
300
+
301
+ The maximum value for `--blocks_to_swap` is 38.
302
+
303
+ <details>
304
+ <summary>日本語</summary>
305
+
306
+ FramePackの推論は専用のスクリプト`fpack_generate_video.py`を使用します。コマンド記述例は英語版を参考にしてください。
307
+
308
+ HunyuanVideoの推論との主な違いは次のとおりです。
309
+ - `fpack_generate_video.py`を使用します。
310
+ - `--f1`を指定すると、FramePack-F1モデルの推論を行います(順方向で生成)。`--dit`にFramePack-F1モデルを指定する必要があります。
311
+ - `--vae`、`--text_encoder1`、`--text_encoder2`、`--image_encoder`を指定する必要があります。
312
+ - `--image_path`を指定する必要があります(開始フレーム)。
313
+ - `--video_seconds` または `--video_sections` を指定する必要があります。`--video_seconds`は秒単位でのビデオの長さを指定し、`--video_sections`はセクション数を指定します。`--video_sections`を指定した場合、`--video_seconds`は無視されます。
314
+ - `--video_size`は生成するビデオのサイズで、高さと幅をその順番で指定します。
315
+ - `--prompt`: 生成用のプロンプトです。
316
+ - 必要に応じて`--latent_window_size`引数(デフォルト9)を指定できます(キャッシング時、学習時と一致させる必要があります)。
317
+ - DiTのメモリ使用量を削減するために、`--fp8_scaled`オプションを指定可能です。品質はやや低下する可能性があります。またText Encoder 1のメモリ使用量を削減するために、`--fp8_llm`オプションを指定可能です。DiT用に`--fp8`単独のオプションも用意されていますが、`--fp8_scaled`の方が品質が良い可能性があります。
318
+ - LoRAの読み込みオプション(`--lora_weight`、`--lora_multiplier`、`--include_patterns`、`--exclude_patterns`)が利用可能です。LyCORISもサポートされています。
319
+ - `--embedded_cfg_scale`(デフォルト10.0)は、蒸留されたガイダンススケールを制御します。通常は変更しないでください。
320
+ - `--guidance_scale`(デフォルト1.0)は、標準の分類器フリーガイダンススケールを制御します。**FramePackモデルのベースモデルでは、通常1.0から変更しないことをお勧めします。**
321
+ - `--guidance_rescale`(デフォルト0.0)も利用可能ですが、通常は必要ありません。
322
+ - `--bulk_decode`オプションは、すべてのフレームを一度にデコードできるオプションです。高速ですが、デコード中にVRAMを多く使用します。VRAM不足エラーを防ぐために、`--vae_chunk_size`と`--vae_spatial_tile_sample_min_size`オプションを指定することをお勧めします。
323
+ - `--sample_solver`(デフォルト`unipc`)は利用可能ですが、`unipc`のみが実装されています。
324
+ - `--save_merged_model`オプションは、LoRAの重みをマージした後にDiTモデルを保存するためのオプションです。これを指定すると推論はスキップされます。
325
+ - `--latent_paddings`オプションは、各セクションのデフォルトのパディングを上書きします。カンマ区切りの整数リストとして指定します。例:`--latent_paddings 0,0,0,0`。`--f1`を指定した場合は無視されます。
326
+ - `--custom_system_prompt`オプションは、LLaMA Text Encoder 1のデフォルトのシステムプロンプトを上書きします。文字列として指定します。デフォルトのシステムプロンプトは[こちら](../hunyuan_model/text_encoder.py#L152)を参照してください。
327
+ - `--rope_scaling_timestep_threshold`オプションはRoPEスケーリングのタイムステップ閾値で、デフォルトはNone(無効)です。設定すると、タイムステップが閾値以上の場合にのみRoPEスケーリングが適用されます。800程度から初めて調整してください。1フレーム推論時での使用を想定しており、それ以外の場合は想定していません。
328
+ - `--rope_scaling_factor`オプションはRoPEスケーリング係数で、デフォルトは0.5で、解像度が2倍の場合を想定しています。1.5倍なら0.7程度が良いでしょう。
329
+
330
+ `--video_size`、`--fps`、`--infer_steps`、`--save_path`、`--output_type`、`--seed`、`--attn_mode`、`--blocks_to_swap`、`--vae_chunk_size`、`--vae_spatial_tile_sample_min_size`などの他のオプションは、HunyuanVideo/Wan2.1と同様に機能します。
331
+
332
+ `--lora_weight`に指定できるLoRAの重みは、当リポジトリで学習したFramePackの重み以外に、当リポジトリのHunyuanVideoのLoRA、diffusion-pipeのHunyuanVideoのLoRAが指定可能です(自動判定)。
333
+
334
+ `--blocks_to_swap`の最大値は38です。
335
+ </details>
336
+
337
+ ## Batch and Interactive Modes / バッチモードとインタラクティブモード
338
+
339
+ In addition to single video generation, FramePack now supports batch generation from a prompt file and interactive prompt input:
340
+
341
+ ### Batch Mode from File / ファイルからのバッチモード
342
+
343
+ Generate multiple videos from prompts stored in a text file:
344
+
345
+ ```bash
346
+ python fpack_generate_video.py --from_file prompts.txt
347
+ --dit path/to/dit_model --vae path/to/vae_model.safetensors
348
+ --text_encoder1 path/to/text_encoder1 --text_encoder2 path/to/text_encoder2
349
+ --image_encoder path/to/image_encoder_model.safetensors --save_path output_directory
350
+ ```
351
+
352
+ The prompts file format:
353
+ - One prompt per line
354
+ - Empty lines and lines starting with # are ignored (comments)
355
+ - Each line can include prompt-specific parameters using command-line style format:
356
+
357
+ ```
358
+ A beautiful sunset over mountains --w 832 --h 480 --f 5 --d 42 --s 20 --i path/to/start_image.jpg
359
+ A busy city street at night --w 480 --h 832 --i path/to/another_start.jpg
360
+ ```
361
+
362
+ Supported inline parameters (if omitted, default values from the command line are used):
363
+ - `--w`: Width
364
+ - `--h`: Height
365
+ - `--f`: Video seconds
366
+ - `--d`: Seed
367
+ - `--s`: Inference steps
368
+ - `--g` or `--l`: Guidance scale
369
+ - `--i`: Image path (for start image)
370
+ - `--im`: Image mask path
371
+ - `--n`: Negative prompt
372
+ - `--vs`: Video sections
373
+ - `--ei`: End image path
374
+ - `--ci`: Control image path (explained in one-frame inference documentation)
375
+ - `--cim`: Control image mask path (explained in one-frame inference documentation)
376
+ - `--of`: One frame inference mode options (same as `--one_frame_inference` in the command line), options for one-frame inference
377
+
378
+ In batch mode, models are loaded once and reused for all prompts, significantly improving overall generation time compared to multiple single runs.
379
+
380
+ ### Interactive Mode / インタラクティブモード
381
+
382
+ Interactive command-line interface for entering prompts:
383
+
384
+ ```bash
385
+ python fpack_generate_video.py --interactive
386
+ --dit path/to/dit_model --vae path/to/vae_model.safetensors
387
+ --text_encoder1 path/to/text_encoder1 --text_encoder2 path/to/text_encoder2
388
+ --image_encoder path/to/image_encoder_model.safetensors --save_path output_directory
389
+ ```
390
+
391
+ In interactive mode:
392
+ - Enter prompts directly at the command line
393
+ - Use the same inline parameter format as batch mode
394
+ - Use Ctrl+D (or Ctrl+Z on Windows) to exit
395
+ - Models remain loaded between generations for efficiency
396
+
397
+ <details>
398
+ <summary>日本語</summary>
399
+
400
+ 単一動画の生成に加えて、FramePackは現在、ファイルからのバッチ生成とインタラクティブなプロンプト入力をサポートしています。
401
+
402
+ #### ファイルからのバッチモード
403
+
404
+ テキストファイルに保存されたプロンプトから複数の動画を生成します:
405
+
406
+ ```bash
407
+ python fpack_generate_video.py --from_file prompts.txt
408
+ --dit path/to/dit_model --vae path/to/vae_model.safetensors
409
+ --text_encoder1 path/to/text_encoder1 --text_encoder2 path/to/text_encoder2
410
+ --image_encoder path/to/image_encoder_model.safetensors --save_path output_directory
411
+ ```
412
+
413
+ プロンプトファイルの形式(サンプルは英語ドキュメントを参照):
414
+ - 1行に1つのプロンプト
415
+ - 空行や#で始まる行は無視されます(コメント)
416
+ - 各行にはコマンドライン形式でプロンプト固有のパラメータを含めることができます:
417
+
418
+ サポートされているインラインパラメータ(省略した場合、コマンドラインのデフォルト値が使用されます)
419
+ - `--w`: 幅
420
+ - `--h`: 高さ
421
+ - `--f`: 動画の秒数
422
+ - `--d`: シード
423
+ - `--s`: 推論ステップ
424
+ - `--g` または `--l`: ガイダンススケール
425
+ - `--i`: 画像パス(開始画像用)
426
+ - `--im`: 画像マスクパス
427
+ - `--n`: ネガティブプロンプト
428
+ - `--vs`: 動画セクション数
429
+ - `--ei`: 終了画像パス
430
+ - `--ci`: 制御画像パス(1フレーム推論のドキュメントで解説)
431
+ - `--cim`: 制御画像マスクパス(1フレーム推論のドキュメントで解説)
432
+ - `--of`: 1フレーム推論モードオプション(コマンドラインの`--one_frame_inference`と同様、1フレーム推論のオプション)
433
+
434
+ バッチモードでは、モデルは一度だけロードされ、すべてのプロンプトで再利用されるため、複数回の単一実行と比較して全体的な生成時間が大幅に改善されます。
435
+
436
+ #### インタラクティブモード
437
+
438
+ プロンプトを入力するためのインタラクティブなコマンドラインインターフェース:
439
+
440
+ ```bash
441
+ python fpack_generate_video.py --interactive
442
+ --dit path/to/dit_model --vae path/to/vae_model.safetensors
443
+ --text_encoder1 path/to/text_encoder1 --text_encoder2 path/to/text_encoder2
444
+ --image_encoder path/to/image_encoder_model.safetensors --save_path output_directory
445
+ ```
446
+
447
+ インタラクティブモードでは:
448
+ - コマンドラインで直接プロンプトを入力
449
+ - バッチモードと同じインラインパラメータ形式を使用
450
+ - 終了するには Ctrl+D (Windowsでは Ctrl+Z) を使用
451
+ - 効率のため、モデルは生成間で読み込まれたままになります
452
+ </details>
453
+
454
+ ## Advanced Video Control Features (Experimental) / 高度なビデオ制御機能(実験的)
455
+
456
+ This section describes experimental features added to the `fpack_generate_video.py` script to provide finer control over the generated video content, particularly useful for longer videos or sequences requiring specific transitions or states. These features leverage the Inverted Anti-drifting sampling method inherent to FramePack.
457
+
458
+ ### **1. End Image Guidance (`--end_image_path`)**
459
+
460
+ * **Functionality:** Guides the generation process to make the final frame(s) of the video resemble a specified target image.
461
+ * **Usage:** `--end_image_path <path_to_image_file>`
462
+ * **Mechanism:** The provided image is encoded using the VAE. This latent representation is used as a target or starting point during the generation of the final video section (which is the first step in Inverted Anti-drifting).
463
+ * **Use Cases:** Defining a clear ending for the video, such as a character striking a specific pose or a product appearing in a close-up.
464
+
465
+ This option is ignored if `--f1` is specified. The end image is not used in the FramePack-F1 model.
466
+
467
+ ### **2. Section Start Image Guidance (`--image_path` Extended Format)**
468
+
469
+ * **Functionality:** Guides specific sections within the video to start with a visual state close to a provided image.
470
+ * You can force the start image by setting `--latent_paddings` to `0,0,0,0` (a comma-separated list with one value per section). If `latent_paddings` is set to 1 or more, the specified image will be used as a reference image (the default behavior).
471
+ * **Usage:** `--image_path "SECTION_SPEC:path/to/image.jpg;;;SECTION_SPEC:path/to/another.jpg;;;..."`
472
+ * `SECTION_SPEC`: Defines the target section(s). Rules:
473
+ * `0`: The first section of the video (generated last in Inverted Anti-drifting).
474
+ * `-1`: The last section of the video (generated first).
475
+ * `N` (non-negative integer): The N-th section (0-indexed).
476
+ * `-N` (negative integer): The N-th section from the end.
477
+ * `S-E` (range, e.g., `0-2`): Applies the same image guidance to sections S through E (inclusive).
478
+ * Use `;;;` as a separator between definitions.
479
+ * If no image is specified for a section, generation proceeds based on the prompt and preceding (future time) section context.
480
+ * **Mechanism:** When generating a specific section, if a corresponding start image is provided, its VAE latent representation is strongly referenced as the "initial state" for that section. This guides the beginning of the section towards the specified image while attempting to maintain temporal consistency with the subsequent (already generated) section.
481
+ * **Use Cases:** Defining clear starting points for scene changes, specifying character poses or attire at the beginning of certain sections.
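+ For clarity, the `SECTION_SPEC` syntax can be interpreted as in the following sketch (a hypothetical helper for illustration, not the actual parser used by `fpack_generate_video.py`; here negative indices are kept as-is and would be resolved against the total section count later):
+
+ ```python
+ def parse_section_spec(arg: str) -> dict[int, str]:
+     """Parse e.g. '0:a.png;;;1-2:b.png;;;-1:c.png' into {section_index: value}."""
+     result: dict[int, str] = {}
+     for part in arg.split(";;;"):
+         spec, value = part.split(":", 1)
+         if "-" in spec and not spec.startswith("-"):
+             start, end = map(int, spec.split("-"))  # range form "S-E", inclusive
+             for idx in range(start, end + 1):
+                 result[idx] = value
+         else:
+             result[int(spec)] = value  # single index, possibly negative
+     return result
+
+ # {0: './img_a.png', 1: './img_b.png', 2: './img_b.png', -1: './img_c.png'}
+ print(parse_section_spec("0:./img_a.png;;;1-2:./img_b.png;;;-1:./img_c.png"))
+ ```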
482
+
483
+ ### **3. Section-Specific Prompts (`--prompt` Extended Format)**
484
+
485
+ * **Functionality:** Allows providing different text prompts for different sections of the video, enabling more granular control over the narrative or action flow.
486
+ * **Usage:** `--prompt "SECTION_SPEC:Prompt text for section(s);;;SECTION_SPEC:Another prompt;;;..."`
487
+ * `SECTION_SPEC`: Uses the same rules as `--image_path`.
488
+ * Use `;;;` as a separator.
489
+ * If a prompt for a specific section is not provided, the prompt associated with index `0` (or the closest specified applicable prompt) is typically used. Check behavior if defaults are critical.
490
+ * **Mechanism:** During the generation of each section, the corresponding section-specific prompt is used as the primary textual guidance for the model.
491
+ * **Prompt Content Recommendation** when using `--latent_paddings 0,0,0,0` without `--f1` (original FramePack model):
492
+ * Recall that FramePack uses Inverted Anti-drifting and references future context.
493
+ * It is recommended to describe "**the main content or state change that should occur in the current section, *and* the subsequent events or states leading towards the end of the video**" in the prompt for each section.
494
+ * Including the content of subsequent sections in the current section's prompt helps the model maintain context and overall coherence.
495
+ * Example: For section 1, the prompt might describe what happens in section 1 *and* briefly summarize section 2 (and beyond).
496
+ * However, based on observations (e.g., the `latent_paddings` comment), the model's ability to perfectly utilize very long-term context might be limited. Experimentation is key. Describing just the "goal for the current section" might also work. Start by trying the "section and onwards" approach.
497
+ * Write prompts in the usual way (a single prompt describing the whole video) when `latent_paddings` is >= 1 or `--latent_paddings` is not specified, or when using `--f1` (FramePack-F1 model).
498
+ * **Use Cases:** Describing evolving storylines, gradual changes in character actions or emotions, step-by-step processes over time.
499
+
500
+ ### **Combined Usage Example** (with `--f1` not specified)
501
+
502
+ Generating a 3-section video of "A dog runs towards a thrown ball, catches it, and runs back":
503
+
504
+ ```bash
505
+ python fpack_generate_video.py \
506
+ --prompt "0:A dog runs towards a thrown ball, catches it, and runs back;;;1:The dog catches the ball and then runs back towards the viewer;;;2:The dog runs back towards the viewer holding the ball" \
507
+ --image_path "0:./img_start_running.png;;;1:./img_catching.png;;;2:./img_running_back.png" \
508
+ --end_image_path ./img_returned.png \
509
+ --save_path ./output \
510
+ # ... other arguments
511
+ ```
512
+
513
+ * **Generation Order:** Section 2 -> Section 1 -> Section 0
514
+ * **Generating Section 2:**
515
+ * Prompt: "The dog runs back towards the viewer holding the ball"
516
+ * Start Image: `./img_running_back.png`
517
+ * End Image: `./img_returned.png` (Initial target)
518
+ * **Generating Section 1:**
519
+ * Prompt: "The dog catches the ball and then runs back towards the viewer"
520
+ * Start Image: `./img_catching.png`
521
+ * Future Context: Generated Section 2 latent
522
+ * **Generating Section 0:**
523
+ * Prompt: "A dog runs towards a thrown ball, catches it, and runs back"
524
+ * Start Image: `./img_start_running.png`
525
+ * Future Context: Generated Section 1 & 2 latents
526
+
527
+ ### **Important Considerations**
528
+
529
+ * **Inverted Generation:** Always remember that generation proceeds from the end of the video towards the beginning. Section `-1` (the last section, `2` in the example) is generated first.
530
+ * **Continuity vs. Guidance:** While start image guidance is powerful, drastically different images between sections might lead to unnatural transitions. Balance guidance strength with the need for smooth flow.
531
+ * **Prompt Optimization:** The prompt content recommendation is a starting point. Fine-tune prompts based on observed model behavior and desired output quality.
532
+
533
+ <details>
534
+ <summary>日本語</summary>
535
+
536
+ ### **高度な動画制御機能(実験的)**
537
+
538
+ このセクションでは、`fpack_generate_video.py` スクリプトに追加された実験的な機能について説明します。これらの機能は、生成される動画の内容をより詳細に制御するためのもので、特に長い動画や特定の遷移・状態が必要なシーケンスに役立ちます。これらの機能は、FramePack固有のInverted Anti-driftingサンプリング方式を活用しています。
539
+
540
+ #### **1. 終端画像ガイダンス (`--end_image_path`)**
541
+
542
+ * **機能:** 動画の最後のフレーム(群)を指定したターゲット画像に近づけるように生成を誘導します。
543
+ * **書式:** `--end_image_path <画像ファイルパス>`
544
+ * **動作:** 指定された画像はVAEでエンコードされ、その潜在表現が動画の最終セクション(Inverted Anti-driftingでは最初に生成される)の生成時の目標または開始点として使用されます。
545
+ * **用途:** キャラクターが特定のポーズで終わる、特定の商品がクローズアップで終わるなど、動画の結末を明確に定義する場合。
546
+
547
+ このオプションは、`--f1`を指定した場合は無視されます。FramePack-F1モデルでは終端画像は使用されません。
548
+
549
+ #### **2. セクション開始画像ガイダンス (`--image_path` 拡張書式)**
550
+
551
+ * **機能:** 動画内の特定のセクションが、指定された画像に近い視覚状態から始まるように誘導します。
552
+ * `--latent_paddings`を`0,0,0,0`(カンマ区切りでセクション数だけ指定)に設定することで、セクションの開始画像を強制できます。`latent_paddings`が1以上の場合、指定された画像は参照画像として使用されます。
553
+ * **書式:** `--image_path "セクション指定子:画像パス;;;セクション指定子:別の画像パス;;;..."`
554
+ * `セクション指定子`: 対象セクションを定義します。ルール:
555
+ * `0`: 動画の最初のセクション(Inverted Anti-driftingでは最後に生成)。
556
+ * `-1`: 動画の最後のセクション(最初に生成)。
557
+ * `N`(非負整数): N番目のセクション(0始まり)。
558
+ * `-N`(負整数): 最後からN番目のセクション。
559
+ * `S-E`(範囲, 例:`0-2`): セクションSからE(両端含む)に同じ画像を適用。
560
+ * 区切り文字は `;;;` です。
561
+ * セクションに画像が指定されていない場合、プロンプトと後続(未来時刻)セクションのコンテキストに基づいて生成されます。
562
+ * **動作:** 特定セクションの生成時、対応する開始画像が指定されていれば、そのVAE潜在表現がそのセクションの「初期状態」として強く参照されます。これにより、後続(生成済み)セクションとの時間的連続性を維持しようとしつつ、セクションの始まりを指定画像に近づけます。
563
+ * **用途:** シーン変更の起点を明確にする、特定のセクション開始時のキャラクターのポーズや服装を指定するなど。
564
+
565
+ #### **3. セクション別プロンプト (`--prompt` 拡張書式)**
566
+
567
+ * **機能:** 動画のセクションごとに異なるテキストプロンプトを与え、物語やアクションの流れをより細かく指示できます。
568
+ * **書式:** `--prompt "セクション指定子:プロンプトテキスト;;;セクション指定子:別のプロンプト;;;..."`
569
+ * `セクション指定子`: `--image_path` と同じルールです。
570
+ * 区切り文字は `;;;` です。
571
+ * 特定セクションのプロンプトがない場合、通常はインデックス`0`に関連付けられたプロンプト(または最も近い適用可能な指定プロンプト)が使用されます。デフォルトの挙動が重要な場合は確認してください。
572
+ * **動作:** 各セクションの生成時、対応するセクション別プロンプトがモデルへの主要なテキスト指示として使用されます。
573
+ * `latent_paddings`に`0`を指定した場合(非F1モデル)の **プロンプト内容の推奨:**
574
+ * FramePackはInverted Anti-driftingを採用し、未来のコンテキストを参照することを思い出してください。
575
+ * 各セクションのプロンプトには、「**現在のセクションで起こるべき主要な内容や状態変化、*および*それに続く動画の終端までの内容**」を記述することを推奨します。
576
+ * 現在のセクションのプロンプトに後続セクションの内容を含めることで、モデルが全体的な文脈を把握し、一貫性を保つのに役立ちます。
577
+ * 例:セクション1のプロンプトには、セクション1の内容 *と* セクション2の簡単な要約を記述します。
578
+ * ただし、モデルの長期コンテキスト完全利用能力には限界がある可能性も示唆されています(例:`latent_paddings`コメント)。実験が鍵となります。「現在のセクションの目標」のみを記述するだけでも機能する場合があります。まずは「セクションと以降」アプローチを試すことをお勧めします。
579
+ * 使用するプロンプトは、`latent_paddings`が`1`以上または指定されていない場合、または`--f1`(FramePack-F1モデル)を使用している場合は、通常のプロンプト内容を記述してください。
580
+ * **用途:** 時間経過に伴うストーリーの変化、キャラクターの行動や感情の段階的な変化、段階的なプロセスなどを記述する場合。
581
+
582
+ #### **組み合わせ使用例** (`--f1`未指定時)
583
+
584
+ 「投げられたボールに向かって犬が走り、それを捕まえ、走って戻ってくる」3セクション動画の生成:
585
+ (コマンド記述例は英語版を参考にしてください)
586
+
587
+ * **生成順序:** セクション2 → セクション1 → セクション0
588
+ * **セクション2生成時:**
589
+ * プロンプト: "犬がボールを咥えてこちらに向かって走ってくる"
590
+ * 開始画像: `./img_running_back.png`
591
+ * 終端画像: `./img_returned.png` (初期目標)
592
+ * **セクション1生成時:**
593
+ * プロンプト: "犬がボールを捕まえ、その後こちらに向かって走ってくる"
594
+ * 開始画像: `./img_catching.png`
595
+ * 未来コンテキスト: 生成済みセクション2の潜在表現
596
+ * **セクション0生成時:**
597
+ * プロンプト: "犬が投げられたボールに向かって走り、それを捕まえ、走って戻ってくる"
598
+ * 開始画像: `./img_start_running.png`
599
+ * 未来コンテキスト: 生成済みセクション1 & 2の潜在表現
600
+
601
+ #### **重要な考慮事項**
602
+
603
+ * **逆順生成:** 生成は動画の終わりから始まりに向かって進むことを常に意識してください。セクション`-1`(最後のセクション、上の例では `2`)が最初に生成されます。
604
+ * **連続性とガイダンスのバランス:** 開始画像ガイダンスは強力ですが、セクション間で画像が大きく異なると、遷移が不自然になる可能性があります。ガイダンスの強さとスムーズな流れの必要性のバランスを取ってください。
605
+ * **プロンプトの最適化:** 推奨されるプロンプト内容はあくまでも参考です。モデルの観察された挙動と望ましい出力品質に基づいてプロンプトを微調整してください。
606
+
607
+ </details>
docs/framepack_1f.md ADDED
@@ -0,0 +1,359 @@
 
 
1
+ # FramePack One Frame (Single Frame) Inference and Training / FramePack 1フレーム推論と学習
2
+
3
+ ## Overview / 概要
4
+
5
+ This document explains advanced inference and training methods using the FramePack model, particularly focusing on **"1-frame inference"** and its extensions. These features aim to leverage FramePack's flexibility to enable diverse image generation and editing tasks beyond simple video generation.
6
+
7
+ ### The Concept and Development of 1-Frame Inference
8
+
9
+ While FramePack is originally a model for generating sequential video frames (or frame sections), it was discovered that by focusing on its internal structure, particularly how it handles temporal information with RoPE (Rotary Position Embedding), interesting control over single-frame generation is possible.
10
+
11
+ 1. **Basic 1-Frame Inference**:
12
+ * It takes an initial image and a prompt as input, limiting the number of generated frames to just one.
13
+ * In this process, by intentionally setting a large RoPE timestamp (`target_index`) for the single frame to be generated, a single static image can be obtained that reflects temporal and semantic changes from the initial image according to the prompt.
14
+ * This utilizes FramePack's characteristic of being highly sensitive to RoPE timestamps, as it supports bidirectional contexts like "Inverted anti-drifting." This allows for operations similar to natural language-based image editing, albeit in a limited capacity, without requiring additional training.
15
+
16
+ 2. **Kisekaeichi Method (Feature Merging via Post-Reference)**:
17
+ * This method, an extension of basic 1-frame inference, was **proposed by furusu**. In addition to the initial image, it also uses a reference image corresponding to a "next section-start image" (treated as `clean_latent_post`) as input.
18
+ * The RoPE timestamp (`target_index`) for the image to be generated is set to an intermediate value between the timestamps of the initial image and the section-end image.
19
+ * More importantly, masking (e.g., zeroing out specific regions) is applied to the latent representation of each reference image (a sketch of this masking follows this list). For example, by setting masks to extract a character's face and body shape from the initial image and clothing textures from the reference image, an image can be generated that fuses the desired features of both, similar to a character "dress-up" or outfit swapping. This method can also be fundamentally achieved without additional training.
20
+
21
+ 3. **1f-mc (one frame multi-control) Method (Proximal Frame Blending)**:
22
+ * This method was **proposed by mattyamonaca**. It takes two reference images as input: an initial image (e.g., at `t=0`) and a subsequent image (e.g., at `t=1`, the first frame of a section), and generates a single image blending their features.
23
+ * Unlike Kisekaeichi, latent masking is typically not performed.
24
+ * To fully leverage this method, additional training using LoRA (Low-Rank Adaptation) is recommended. Through training, the model can better learn the relationship and blending method between the two input images to achieve specific editing effects.
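+ As a concrete illustration of the latent masking used by the Kisekaeichi method, the sketch below zeroes out masked regions of a reference latent (an assumed, simplified form; the cache and inference scripts may apply the mask differently, e.g. with soft values or per-frame handling):
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def apply_latent_mask(latent: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+     """latent: (C, H/8, W/8) VAE latent of a reference image.
+     mask: (H, W) float tensor in [0, 1], e.g. the image's alpha channel."""
+     lat_h, lat_w = latent.shape[-2:]
+     # downscale the pixel-space mask to the latent resolution
+     m = F.interpolate(mask[None, None], size=(lat_h, lat_w), mode="bilinear")[0]
+     # keep only the regions that should contribute features; zero out the rest
+     return latent * (m > 0.5).to(latent.dtype)
+ ```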
25
+
26
+ ### Integration into a Generalized Control Framework
27
+
28
+ The concepts utilized in the methods above—specifying reference images, manipulating timestamps, and applying latent masks—have been generalized to create a more flexible control framework.
29
+ Users can arbitrarily specify the following elements for both inference and LoRA training:
30
+
31
+ * **Control Images**: Any set of input images intended to influence the model.
32
+ * **Clean Latent Index (Indices)**: Timestamps corresponding to each control image. These are treated as `clean latent index` internally by FramePack and can be set to any position on the time axis. This is specified as `control_index`.
33
+ * **Latent Masks**: Masks applied to the latent representation of each control image, allowing selective control over which features from the control images are utilized. This is specified as `control_image_mask_path` or the alpha channel of the control image.
34
+ * **Target Index**: The timestamp for the single frame to be generated.
35
+
36
+ This generalized control framework, along with corresponding extensions to the inference and LoRA training tools, has enabled advanced applications such as:
37
+
38
+ * Development of LoRAs that stabilize 1-frame inference effects (e.g., a camera orbiting effect) that were previously unstable with prompts alone.
39
+ * Development of Kisekaeichi LoRAs that learn to perform desired feature merging under specific conditions (e.g., ignoring character information from a clothing reference image), thereby automating the masking process through learning.
40
+
41
+ These features maximize FramePack's potential and open up new creative possibilities in static image generation and editing. Subsequent sections will detail the specific options for utilizing these functionalities.
42
+
43
+ <details>
44
+ <summary>日本語</summary>
45
+
46
+ このドキュメントでは、FramePackモデルを用いた高度な推論および学習手法、特に「1フレーム推論」��その拡張機能について解説します。これらの機能は、FramePackの柔軟性を活かし、動画生成に留まらない多様な画像生成・編集タスクを実現することを目的としています。
47
+
48
+ ### 1フレーム推論の発想と発展
49
+
50
+ FramePackは本来、連続する動画フレーム(またはフレームセクション)を生成するモデルですが、その内部構造、特に時間情報を扱うRoPE (Rotary Position Embedding) の扱いに着目することで、単一フレームの生成においても興味深い制御が可能になることが発見されました。
51
+
52
+ 1. **基本的な1フレーム推論**:
53
+ * 開始画像とプロンプトを入力とし、生成するフレーム数を1フレームに限定します。
54
+ * この際、生成する1フレームに割り当てるRoPEのタイムスタンプ(`target_index`)を意図的に大きな値に設定することで、開始画像からプロンプトに従って時間的・意味的に変化した単一の静止画を得ることができます。
55
+ * これは、FramePackがInverted anti-driftingなどの双方向コンテキストに対応するため、RoPEのタイムスタンプに対して敏感に反応する特性を利用したものです。これにより、学習なしで限定的ながら自然言語による画像編集に近い操作が可能です。
56
+
57
+ 2. **kisekaeichi方式 (ポスト参照による特徴マージ)**:
58
+ * 基本的な1フレーム推論を発展させたこの方式は、**furusu氏により提案されました**。開始画像に加え、「次のセクションの開始画像」に相当する参照画像(`clean_latent_post`として扱われる)も入力として利用します。
59
+ * 生成する画像のRoPEタイムスタンプ(`target_index`)を、開始画像のタイムスタンプとセクション終端画像のタイムスタンプの中間的な値に設定します。
60
+ * さらに重要な点として、各参照画像のlatent表現に対してマスク処理(特定領域を0で埋めるなど)を施します。例えば、開始画像からはキャラクターの顔や体型を、参照画像からは服装のテクスチャを抽出するようにマスクを設定することで、キャラクターの「着せ替え」のような、両者の望ましい特徴を融合させた画像を生成できます。この手法も基本的には学習不要で実現可能です。
61
+
62
+ 3. **1f-mc (one frame multi-control) 方式 (近接フレームブレンド)**:
63
+ * この方式は、**mattyamonaca氏により提案されました**。開始画像(例: `t=0`)と、その直後の画像(例: `t=1`、セクションの最初のフレーム)の2つを参照画像として入力し、それらの特徴をブレンドした単一画像を生成します。
64
+ * kisekaeichiとは異なり、latentマスクは通常行いません。
65
+ * この方式の真価を発揮するには、LoRA (Low-Rank Adaptation) による追加学習が推奨されます。学習により、モデルは2つの入力画像間の関係性やブレンド方法をより適切に学習し、特定の編集効果を実現できます。
66
+
67
+ ### 汎用的な制御フレームワークへの統合
68
+
69
+ 上記の各手法で利用されていた「参照画像の指定」「タイムスタンプの操作」「latentマスクの適用」といった概念を一般化し、より柔軟な制御を可能にするための拡張が行われました。
70
+ ユーザーは以下の要素を任意に指定して、推論およびLoRA学習を行うことができます。
71
+
72
+ * **制御画像 (Control Images)**: モデルに影響を与えるための任意の入力画像群。
73
+ * **Clean Latent Index (Indices)**: 各制御画像に対応するタイムスタンプ。FramePack内部の`clean latent index`として扱われ、時間軸上の任意の位置を指定可能です。`control_index`として指定します。
74
+ * **Latentマスク (Latent Masks)**: 各制御画像のlatentに適用するマスク。これにより、制御画像から利用する特徴を選択的に制御します。`control_image_mask_path`または制御画像のアルファチャンネルとして指定します。
75
+ * **Target Index**: 生成したい単一フレームのタイムスタンプ。
76
+
77
+ この汎用的な制御フレームワークと、それに対応した推論ツールおよびLoRA学習ツールの拡張により、以下のような高度な応用が可能になりました。
78
+
79
+ * プロンプトだけでは不安定だった1フレーム推論の効果(例: カメラ旋回)を安定化させるLoRAの開発。
80
+ * マスク処理を手動で行う代わりに、特定の条件下(例: 服の参照画像からキャラクター情報を無視する)で望ましい特徴マージを行うように学習させたkisekaeichi LoRAの開発。
81
+
82
+ これらの機能は、FramePackのポテンシャルを最大限に引き出し、静止画生成・編集における新たな創造の可能性を拓くものです。以降のセクションでは、これらの機能を実際に利用するための具体的なオプションについて説明します。
83
+
84
+ </details>
85
+
86
+ ## One Frame (Single Frame) Training / 1フレーム学習
87
+
88
+ **This feature is experimental.** It trains in the same way as one frame inference.
89
+
90
+ The dataset must be an image dataset. If you use caption files, you need to specify `control_directory` and place the **start images** in that directory. The `image_directory` should contain the images after the change. The filenames of both directories must match. Caption files should be placed in the `image_directory`.
91
+
92
+ If you use JSONL files, specify them as `{"image_path": "/path/to/target_image1.jpg", "control_path": "/path/to/source_image1.jpg", "caption": "The object changes to red."}`. The `image_path` should point to the images after the change, and `control_path` should point to the starting images.
93
+
94
+ For the dataset configuration, see [here](../dataset/dataset_config.md#sample-for-image-dataset-with-control-images) and [here](../dataset/dataset_config.md#framepack-one-frame-training). There are also examples for kisekaeichi and 1f-mc settings.
95
+
96
+ For single frame training, specify `--one_frame` in `fpack_cache_latents.py` to create the cache. You can also use `--one_frame_no_2x` and `--one_frame_no_4x` options, which have the same meaning as `no_2x` and `no_4x` during inference. It is recommended to set these options to match the inference settings.
97
+
98
+ If you switch one frame training on or off, or change any of these options, overwrite the existing cache by re-running the caching script without specifying `--skip_existing`.
99
+
100
+ Specify `--one_frame` in `fpack_train_network.py` to change the inference method during sample generation.
101
+
102
+ The optimal training settings are currently unknown. Feedback is welcome.
103
+
104
+ ### Example of prompt file description for sample generation
105
+
106
+ The command line option `--one_frame_inference` corresponds to `--of`, and `--control_image_path` corresponds to `--ci`.
107
+
108
+ Note that while `--control_image_path` takes multiple paths after a single flag (e.g. `--control_image_path img1.png img2.png`), `--ci` must be repeated for each image (e.g. `--ci img1.png --ci img2.png`).
109
+
110
+ Normal single frame training:
111
+ ```
112
+ The girl wears a school uniform. --i path/to/start.png --ci path/to/start.png --of no_2x,no_4x,target_index=1,control_index=0 --f 1 --s 10 --fs 7 --d 1234 --w 384 --h 576
113
+ ```
114
+
115
+ Kisekaeichi training:
116
+ ```
117
+ The girl wears a school uniform. --i path/to/start_with_alpha.png --ci path/to/ref_with_alpha.png --ci path/to/start_with_alpha.png --of no_post,no_2x,no_4x,target_index=5,control_index=0;10 --f 1 --s 10 --fs 7 --d 1234 --w 384 --h 576
118
+ ```
119
+
120
+ <details>
121
+ <summary>日本語</summary>
122
+
123
+ **この機能は実験的なものです。** 1フレーム推論と同様の方法で学習を行います。
124
+
125
+ データセットは画像データセットである必要があります。キャプションファイルを用いる場合は、`control_directory`を追加で指定し、そのディレクトリに**開始画像**を格納してください。`image_directory`には変化後の画像を格納します。両者のファイル名は一致させる必要があります。キャプションファイルは`image_directory`に格納してください。
126
+
127
+ JSONLファイルを用いる場合は、`{"image_path": "/path/to/target_image1.jpg", "control_path": "/path/to/source_image1.jpg", "caption": "The object changes to red"}`のように指定してください。`image_path`は変化後の画像、`control_path`は開始画像を指定します。
128
+
129
+ データセットの設定については、[こちら](../dataset/dataset_config.md#sample-for-image-dataset-with-control-images)と[こちら](../dataset/dataset_config.md#framepack-one-frame-training)も参照してください。kisekaeichiと1f-mcの設定例もそちらにあります。
130
+
131
+ 1フレーム学習時は、`fpack_cache_latents.py`に`--one_frame`を指定してキャッシュを作成してください。また`--one_frame_no_2x`と`--one_frame_no_4x`オプションも利用可能です。推論時の`no_2x`、`no_4x`と同じ意味を持ちますので、推論時と同じ設定にすることをお勧めします。
132
+
133
+ 1フレーム学習か否かを変更する場合、またこれらのオプションを変更する場合は、`--skip_existing`を指定せずに既存のキャッシュを上書きしてください。
134
+
135
+ また、`fpack_train_network.py`に`--one_frame`を指定してサンプル画像生成時の推論方法を変更してください。
136
+
137
+ 最適な学習設定は今のところ不明です。フィードバックを歓迎します。
138
+
139
+ **サンプル生成のプロンプトファイル記述例**
140
+
141
+ コマンドラインオプション`--one_frame_inference`に相当する `--of`と、`--control_image_path`に相当する`--ci`が用意されています。
142
+
143
+ ※ `--ci`は複数指定可能ですが、`--control_image_path`は`--control_image_path img1.png img2.png`のようにスペースで区切るのに対して、`--ci`は`--ci img1.png --ci img2.png`のように指定するので注意してください。
144
+
145
+ 通常の1フレーム学習:
146
+ ```
147
+ The girl wears a school uniform. --i path/to/start.png --ci path/to/start.png --of no_2x,no_4x,target_index=1,control_index=0 --f 1 --s 10 --fs 7 --d 1234 --w 384 --h 576
148
+ ```
149
+
150
+ kisekaeichi方式:
151
+ ```
152
+ The girl wears a school uniform. --i path/to/start_with_alpha.png --ci path/to/ref_with_alpha.png --ci path/to/start_with_alpha.png --of no_post,no_2x,no_4x,target_index=5,control_index=0;10 --f 1 --s 10 --fs 7 --d 1234 --w 384 --h 576
153
+ ```
154
+
155
+ </details>
156
+
157
+ ## One (single) Frame Inference / 1フレーム推論
158
+
159
+ **This feature is highly experimental** and not officially supported. It is intended for users who want to explore the potential of FramePack for one frame inference, which is not a standard feature of the model.
160
+
161
+ This script also allows for one frame inference, which is not an official feature of FramePack but rather a custom implementation.
162
+
163
+ In principle, it generates the image that would appear a specified amount of time after the starting image, following the prompt. This means that, although limited, it allows for natural-language-based image editing.
164
+
165
+ To perform one frame inference, pass at least one flag to the `--one_frame_inference` option. Here is an example:
166
+
167
+ ```bash
168
+ --video_sections 1 --output_type latent_images --one_frame_inference default
169
+ ```
170
+
171
+ Setting `--one_frame_inference` to `default` or `no_2x,no_4x` is recommended. If you set `--output_type` to `latent_images`, both the latent and the image will be saved.
172
+
173
+ You can specify the following strings in the `--one_frame_inference` option, separated by commas:
174
+
175
+ - `no_2x`: Generates without passing the zero-vector clean latents 2x to the model. Slightly improves generation speed. The impact on generation results is unknown.
176
+ - `no_4x`: Generates without passing the zero-vector clean latents 4x to the model. Slightly improves generation speed. The impact on generation results is unknown.
177
+ - `no_post`: Generates without passing the zero-vector clean latents post to the model. Improves generation speed by about 20%, but may result in unstable generation.
178
+ - `target_index=<integer>`: Specifies the index of the image to be generated. The default is the last frame (i.e., `latent_window_size`).
179
+
180
+ For example, use `--one_frame_inference default` to pass clean latents 2x, clean latents 4x, and post to the model, or `--one_frame_inference no_2x,no_4x` to skip passing clean latents 2x and 4x. `--one_frame_inference target_index=9` can be used to specify the target index of the generated image.
181
+
182
+ The `--one_frame_inference` option also supports the advanced inference described in the next section, where additional parameters such as `target_index` and `control_index` allow for more detailed control.
183
+
184
+ Normally, specify `--video_sections 1` to indicate only one section (one image).
185
+
186
+ Increasing `target_index` from the default of 9 may result in larger changes. Generation has been confirmed to work without breaking down with values up to around 40.
187
+
188
+ The `--end_image_path` is ignored for one frame inference.
189
+
190
+ <details>
191
+ <summary>日本語</summary>
192
+
193
+ **この機能は非常に実験的であり**、公式にはサポートされていません。FramePackを使用して1フレーム推論の可能性を試したいユーザーに向けたものです。
194
+
195
+ このスクリプトでは、単一画像の推論を行うこともできます。FramePack公式の機能ではなく、独自の実装です。
196
+
197
+ 理論的には、開始画像から、プロンプトに従い、指定時間経過後の画像を生成します。つまり制限付きですが自然言語による画像編集を行うことができます。
198
+
199
+ 単一画像推論を行うには`--one_frame_inference`オプションに、何らかのオプションを指定してください。記述例は以下の通りです。
200
+
201
+ ```bash
202
+ --video_sections 1 --output_type latent_images --one_frame_inference default
203
+ ```
204
+
205
+ `--one_frame_inference`のオプションは、`default`または `no_2x,no_4x`を推奨します。`--output_type`に`latent_images`を指定するとlatentと画像の両方が保存されます。
206
+
207
+ `--one_frame_inference`のオプションには、カンマ区切りで以下のオプションを任意個数指定できます。
208
+
209
+ - `no_2x`: ゼロベクトルの clean latents 2xをモデルに渡さずに生成します。わずかに生成速度が向上します。生成結果への影響は不明です。
210
+ - `no_4x`: ゼロベクトルの clean latents 4xをモデルに渡さずに生成します。わずかに生成速度が向上します。生成結果への影響は不明です。
211
+ - `no_post`: ゼロベクトルの clean latents の post を渡さずに生成します。生成速度が20%程度向上しますが、生成結果が不安定になる場合があります。
212
+ - `target_index=<整数>`: 生成する画像のindexを指定します。デフォルトは最後のフレームです(=latent_window_size)。
213
+
214
+ たとえば、`--one_frame_inference default`を使用すると、clean latents 2x、clean latents 4x、postをモデルに渡します。`--one_frame_inference no_2x,no_4x`を使用すると、clean latents 2xと4xをモデルに渡すのをスキップします。`--one_frame_inference target_index=9`を使用して、生成する画像のターゲットインデックスを指定できます。
215
+
216
+ 後述の高度な推論では、このオプション内で `target_index`、`control_index` といった追加のパラメータを指定して、より詳細な制御が可能です。
217
+
218
+ clean latents 2x、clean latents 4x、postをモデルに渡す場合でも値はゼロベクトルですが、値を渡すか否かで結果は変わります。特に`no_post`を指定すると、`latent_window_size`を大きくしたときに生成結果が不安定になる場合があります。
219
+
220
+ 通常は`--video_sections 1` として1セクションのみ(画像1枚)を指定してください。
221
+
222
+ `target_index` をデフォルトの9から大きくすると、変化量が大きくなる可能性があります。40程度までは破綻なく生成されることを確認しています。
223
+
224
+ `--end_image_path`は無視されます。
225
+
226
+ </details>
227
+
228
+ ## kisekaeichi method (Post Reference Options) and 1f-mc (Multi-Control) / kisekaeichi方式(ポスト参照オプション)と1f-mc(マルチコントロール)
229
+
230
+ The `kisekaeichi` method was proposed by furusu. The `1f-mc` method was proposed by mattyamonaca in pull request [#304](https://github.com/kohya-ss/musubi-tuner/pull/304).
231
+
232
+ In this repository, these methods have been integrated and can be specified with the `--one_frame_inference` option. This allows for specifying any number of control images as clean latents, along with indices. This means you can specify multiple starting images and multiple clean latent posts. Additionally, masks can be applied to each image.
233
+
234
+ It is expected to work only with the standard (non-F1) FramePack model, not with the F1 model.
235
+
236
+ The following options have been added to `--one_frame_inference`. These can be used in conjunction with existing flags like `target_index`, `no_post`, `no_2x`, and `no_4x`.
237
+
238
+ - `control_index=<integer_or_semicolon_separated_integers>`: Specifies the index(es) of the clean latent for the control image(s). You must specify the same number of indices as the number of control images specified with `--control_image_path`.
239
+
240
+ Additionally, the following command-line options have been added. These arguments are only valid when `--one_frame_inference` is specified.
241
+
242
+ - `--control_image_path <path1> [<path2> ...]` : Specifies the path(s) to control (reference) image(s) for one frame inference. Provide one or more paths separated by spaces. Images with an alpha channel can be specified. If an alpha channel is present, it is used as a mask for the clean latent.
243
+ - `--control_image_mask_path <path1> [<path2> ...]` : Specifies the path(s) to grayscale mask(s) to be applied to the control image(s). Provide one or more paths separated by spaces. Each mask is applied to the corresponding control image. The 255 areas are referenced, while the 0 areas are ignored. A sketch of how such a mask can be applied to a control image's latent is shown after this list.
244
+
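+ For intuition, the following is a minimal sketch of how such a grayscale mask (or alpha channel) could be applied to a control image's clean latent: the mask is resized to the latent resolution and multiplied in, so 0 regions are zeroed out. The helper name and the exact resizing/interpolation used by the actual scripts are assumptions for illustration only.
+
+ ```python
+ import numpy as np
+ import torch
+ from PIL import Image
+
+
+ def apply_latent_mask(clean_latent: torch.Tensor, mask_path: str) -> torch.Tensor:
+     # clean_latent: latent of one control image, shape (C, 1, H/8, W/8)
+     # mask: grayscale image where 255 = referenced, 0 = ignored
+     lh, lw = clean_latent.shape[-2], clean_latent.shape[-1]
+     mask = Image.open(mask_path).convert("L").resize((lw, lh), Image.BILINEAR)
+     mask = torch.from_numpy(np.array(mask, dtype=np.float32) / 255.0)  # (H/8, W/8), values in [0, 1]
+     return clean_latent * mask  # masked-out regions become zero and are effectively ignored
+ ```
+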
245
+ **Example of specifying kisekaeichi:**
246
+
247
+ The kisekaeichi method works without training, but using a dedicated LoRA may yield better results.
248
+
249
+ ```bash
250
+ --video_sections 1 --output_type latent_images --image_path start_image.png --control_image_path start_image.png clean_latent_post_image.png \
251
+ --one_frame_inference target_index=1,control_index=0;10,no_post,no_2x,no_4x --control_image_mask_path ctrl_mask1.png ctrl_mask2.png
252
+ ```
253
+
254
+ In this example, `start_image.png` (for `clean_latent_pre`) and `clean_latent_post_image.png` (for `clean_latent_post`) are the reference images. The `target_index` specifies the index of the generated image. The `control_index` specifies the clean latent index for each control image, so it will be `0;10`. The masks for the control images are specified with `--control_image_mask_path`.
255
+
256
+ The optimal values for `target_index` and `control_index` are unknown. The `target_index` should be specified as 1 or higher. The `control_index` should be set to an appropriate value relative to `latent_window_size`. Specifying 1 for `target_index` results in less change from the starting image, but may introduce noise. Specifying 9 or 13 may reduce noise but result in larger changes from the original image.
257
+
258
+ The `control_index` should be larger than `target_index`. Typically, it is set to `10`, but larger values (e.g., around `13-16`) may also work.
259
+
260
+ Sample images and command lines for reproduction are as follows:
261
+
262
+ ```bash
263
+ python fpack_generate_video.py --video_size 832 480 --video_sections 1 --infer_steps 25 \
264
+ --prompt "The girl in a school blazer in a classroom." --save_path path/to/output --output_type latent_images \
265
+ --dit path/to/dit --vae path/to/vae --text_encoder1 path/to/text_encoder1 --text_encoder2 path/to/text_encoder2 \
266
+ --image_encoder path/to/image_encoder --attn_mode sdpa --vae_spatial_tile_sample_min_size 128 --vae_chunk_size 32 \
267
+ --image_path path/to/kisekaeichi_start.png --control_image_path path/to/kisekaeichi_start.png path/to/kisekaeichi_ref.png \
268
+ --one_frame_inference target_index=1,control_index=0;10,no_2x,no_4x,no_post \
269
+ --control_image_mask_path path/to/kisekaeichi_start_mask.png path/to/kisekaeichi_ref_mask.png --seed 1234
270
+ ```
271
+
272
+ Specify `--fp8_scaled` and `--blocks_to_swap` options according to your VRAM capacity.
273
+
274
+ - [kisekaeichi_start.png](./kisekaeichi_start.png)
275
+ - [kisekaeichi_ref.png](./kisekaeichi_ref.png)
276
+ - [kisekaeichi_start_mask.png](./kisekaeichi_start_mask.png)
277
+ - [kisekaeichi_ref_mask.png](./kisekaeichi_ref_mask.png)
278
+
279
+ Generation result: [kisekaeichi_result.png](./kisekaeichi_result.png)
280
+
281
+
282
+ **Example of 1f-mc (Multi-Control):**
283
+
284
+ ```bash
285
+ --video_sections 1 --output_type latent_images --image_path start_image.png --control_image_path start_image.png 2nd_image.png \
286
+ --one_frame_inference target_index=9,control_index=0;1,no_2x,no_4x
287
+ ```
288
+
289
+ In this example, `start_image.png` is the starting image, and `2nd_image.png` is the reference image. The `target_index=9` specifies the index of the generated image, while `control_index=0;1` specifies the clean latent indices for each control image.
290
+
291
+ 1f-mc is intended to be used in combination with a trained LoRA, so adjust `target_index` and `control_index` according to the LoRA's description.
292
+
293
+ <details>
294
+ <summary>日本語</summary>
295
+
296
+ `kisekaeichi`方式はfurusu氏により提案されました。また`1f-mc`方式はmattyamonaca氏によりPR [#304](https://github.com/kohya-ss/musubi-tuner/pull/304) で提案されました。
297
+
298
+ 当リポジトリではこれらの方式を統合し、`--one_frame_inference`オプションで指定できるようにしました。これにより、任意の枚数の制御用画像を clean latentとして指定し、さらにインデックスを指定できます。つまり開始画像の複数枚指定やclean latent postの複数枚指定などが可能です。また、それぞれの画像にマスクを適用することもできます。
299
+
300
+ なお、FramePack無印のみ動作し、F1モデルでは動作しないと思われます。
301
+
302
+ `--one_frame_inference`に以下のオプションが追加されています。`target_index`、`no_post`、`no_2x`や`no_4x`など既存のフラグと併用できます。
303
+
304
+ - `control_index=<整数またはセミコロン区切りの整数>`: 制御用画像のclean latentのインデックスを指定します。`--control_image_path`で指定した制御用画像の数と同じ数のインデックスを指定してください。
305
+
306
+ またコマンドラインオプションに以下が追加されています。これらの引数は`--one_frame_inference`を指定した場合のみ有効です。
307
+
308
+ - `--control_image_path <パス1> [<パス2> ...]` : 1フレーム推論用の制御用(参照)画像のパスを1つ以上、スペース区切りで指定します。アルファチャンネルを持つ画像が指定可能です。アルファチャンネルがある場合は、clean latentへのマスクとして利用されます。
309
+ - `--control_image_mask_path <パス1> [<パス2> ...]` : 制御用画像に適用するグレースケールマスクのパスを1つ以上、スペース区切りで指定します。各マスクは対応する制御用画像に適用されます。255の部分が参照される部分、0の部分が無視される部分です。
310
+
311
+ **kisekaeichiの指定例**:
312
+
313
+ kisekaeichi方式は学習なしでも動作しますが、専用のLoRAを使用することで、より良い結果が得られる可能性があります。
314
+
315
+ ```bash
316
+ --video_sections 1 --output_type latent_images --image_path start_image.png --control_image_path start_image.png clean_latent_post_image.png \
317
+ --one_frame_inference target_index=1,control_index=0;10,no_post,no_2x,no_4x --control_image_mask_path ctrl_mask1.png ctrl_mask2.png
318
+ ```
319
+
320
+ `start_image.png`(clean_latent_preに相当)と`clean_latent_post_image.png`は参照画像(clean_latent_postに相当)です。`target_index`は生成する画像のインデックスを指定します。`control_index`はそれぞれの制御用画像のclean latent indexを指定しますので、`0;10` になります。また`--control_image_mask_path`に制御用画像に適用するマスクを指定します。
321
+
322
+ `target_index`、`control_index`の最適値は不明です。`target_index`は1以上を指定してください。`control_index`は`latent_window_size`に対して適切な値を指定してください。`target_index`に1を指定すると開始画像からの変化が少なくなりますが、ノイズが乗ったりすることが多いようです。9や13などを指定するとノイズは改善されるかもしれませんが、元の画像からの変化が大きくなります。
323
+
324
+ `control_index`は`target_index`より大きい値を指定してください。通常は`10`ですが、これ以上大きな値、たとえば`13~16`程度でも動作するようです。
325
+
326
+ サンプル画像と再現のためのコマンドラインは以下のようになります。
327
+
328
+ ```bash
329
+ python fpack_generate_video.py --video_size 832 480 --video_sections 1 --infer_steps 25 \
330
+ --prompt "The girl in a school blazer in a classroom." --save_path path/to/output --output_type latent_images \
331
+ --dit path/to/dit --vae path/to/vae --text_encoder1 path/to/text_encoder1 --text_encoder2 path/to/text_encoder2 \
332
+ --image_encoder path/to/image_encoder --attn_mode sdpa --vae_spatial_tile_sample_min_size 128 --vae_chunk_size 32 \
333
+ --image_path path/to/kisekaeichi_start.png --control_image_path path/to/kisekaeichi_start.png path/to/kisekaeichi_ref.png \
334
+ --one_frame_inference target_index=1,control_index=0;10,no_2x,no_4x,no_post \
335
+ --control_image_mask_path path/to/kisekaeichi_start_mask.png path/to/kisekaeichi_ref_mask.png --seed 1234
336
+ ```
337
+
338
+ VRAM容量に応じて、`--fp8_scaled`や`--blocks_to_swap`等のオプションを調整してください。
339
+
340
+ - [kisekaeichi_start.png](./kisekaeichi_start.png)
341
+ - [kisekaeichi_ref.png](./kisekaeichi_ref.png)
342
+ - [kisekaeichi_start_mask.png](./kisekaeichi_start_mask.png)
343
+ - [kisekaeichi_ref_mask.png](./kisekaeichi_ref_mask.png)
344
+
345
+ 生成結果:
346
+ - [kisekaeichi_result.png](./kisekaeichi_result.png)
347
+
348
+ **1f-mcの指定例**:
349
+
350
+ ```bash
351
+ --video_sections 1 --output_type latent_images --image_path start_image.png --control_image_path start_image.png 2nd_image.png \
352
+ --one_frame_inference target_index=9,control_index=0;1,no_2x,no_4x
353
+ ```
354
+
355
+ この例では、`start_image.png`が開始画像で、`2nd_image.png`が参照画像です。`target_index=9`は生成する画像のインデックスを指定し、`control_index=0;1`はそれぞれの制御用画像のclean latent indexを指定しています。
356
+
357
+ 1f-mcは学習したLoRAと組み合わせることを想定していますので、そのLoRAの説明に従って、`target_index`や`control_index`を調整してください。
358
+
359
+ </details>
docs/kisekaeichi_ref.png ADDED

Git LFS Details

  • SHA256: e5037f0a0cfb1a6b0a8d1f19fb462df75fb53384d0d9e654c359ca984fafa605
  • Pointer size: 131 Bytes
  • Size of remote file: 584 kB
docs/kisekaeichi_ref_mask.png ADDED
docs/kisekaeichi_result.png ADDED

Git LFS Details

  • SHA256: 223dacb98ac834a442ee124641a6b852b1cde3bc1f11939e78192fc8be2f7b49
  • Pointer size: 131 Bytes
  • Size of remote file: 408 kB
docs/kisekaeichi_start.png ADDED

Git LFS Details

  • SHA256: beee4a910402ef2798b00aa4d193b0b7186380ed24928a4d39acc8635d2cfdaf
  • Pointer size: 132 Bytes
  • Size of remote file: 1.03 MB
docs/kisekaeichi_start_mask.png ADDED
docs/sampling_during_training.md CHANGED
@@ -72,16 +72,20 @@ A line starting with `#` is a comment.
72
  * `--f` specifies the number of frames. The default is 1, which generates a still image.
73
  * `--d` specifies the seed. The default is random.
74
  * `--s` specifies the number of steps in generation. The default is 20.
75
- * `--g` specifies the guidance scale. The default is 6.0, which is the default value during inference of HunyuanVideo. Specify 1.0 for SkyReels V1 models. Ignore this option for Wan2.1 models.
76
- * `--fs` specifies the discrete flow shift. The default is 14.5, which corresponds to the number of steps 20. In the HunyuanVideo paper, 7.0 is recommended for 50 steps, and 17.0 is recommended for less than 20 steps (e.g. 10).
77
 
78
- If you train I2V models, you can use the additional options below.
79
 
80
  * `--i path/to/image.png`: the image path for image2video inference.
81
 
82
- If you train the model with classifier free guidance, you can use the additional options below.
83
 
84
- *`--n negative prompt...`: the negative prompt for the classifier free guidance.
 
 
 
 
85
  *`--l 6.0`: the classifier free guidance scale. Should be set to 6.0 for SkyReels V1 models. 5.0 is the default value for Wan2.1 (if omitted).
86
 
87
  <details>
@@ -94,15 +98,19 @@ If you train the model with classifier free guidance, you can use the additional
94
  * `--f` フレーム数を指定します。省略時は1で、静止画を生成します。
95
  * `--d` シードを指定します。省略時はランダムです。
96
  * `--s` 生成におけるステップ数を指定します。省略時は20です。
97
- * `--g` guidance scaleを指定します。省略時は6.0で、HunyuanVideoの推論時のデフォルト値です。
98
- * `--fs` discrete flow shiftを指定します。省略時は14.5で、ステップ数20の場合に対応した値です。HunyuanVideoの論文では、ステップ数50の場合は7.0、ステップ数20未満(10など)で17.0が推奨されています。
99
 
100
- I2Vモデルを学習する場合、以下の追加オプションを使用できます。
101
 
102
  * `--i path/to/image.png`: image2video推論用の画像パス。
103
 
104
- classifier free guidance(ネガティブプロンプト)を必要とするモデルを学習する場合、以下の追加オプションを使用できます。
 
 
 
 
105
 
106
- *`--n negative prompt...`: classifier free guidance用のネガティブプロンプト。
107
  *`--l 6.0`: classifier free guidance scale。SkyReels V1モデルの場合は6.0に設定してください。Wan2.1の場合はデフォルト値が5.0です(省略時)。
108
  </details>
 
72
  * `--f` specifies the number of frames. The default is 1, which generates a still image.
73
  * `--d` specifies the seed. The default is random.
74
  * `--s` specifies the number of steps in generation. The default is 20.
75
+ * `--g` specifies the embedded guidance scale (not the CFG scale). The default is 6.0 for HunyuanVideo and 10.0 for FramePack, matching each architecture's default inference value. Specify 1.0 for SkyReels V1 models. This option is ignored for Wan2.1 models.
76
+ * `--fs` specifies the discrete flow shift. The default is 14.5, which corresponds to 20 steps. The HunyuanVideo paper recommends 7.0 for 50 steps and 17.0 for fewer than 20 steps (e.g. 10). This option is ignored for FramePack models (10.0 is always used).
77
 
78
+ If you train I2V models, you must add the following option.
79
 
80
  * `--i path/to/image.png`: the image path for image2video inference.
81
 
82
+ If you train Wan2.1-Fun-Control models, you must add the following option.
83
 
84
+ * `--cn path/to/control_video_or_dir_of_images`: the path to the video or directory containing multiple images for control.
85
+
86
+ If you train the model with classifier free guidance (such as Wan2.1), you can use the additional options below.
87
+
88
+ *`--n negative prompt...`: the negative prompt for the classifier free guidance. The default prompt for each model is used if omitted.
89
  *`--l 6.0`: the classifier free guidance scale. Should be set to 6.0 for SkyReels V1 models. 5.0 is the default value for Wan2.1 (if omitted).
90
 
91
  <details>
 
98
  * `--f` フレーム数を指定します。省略時は1で、静止画を生成します。
99
  * `--d` シードを指定します。省略時はランダムです。
100
  * `--s` 生成におけるステップ数を指定します。省略時は20です。
101
+ * `--g` embedded guidance scaleを指定します(CFG scaleではありません)。省略時はHunyuanVideoは6.0、FramePackは10.0で、各アーキテクチャの推論時のデフォルト値です。SkyReels V1モデルの場合は1.0を指定してください。Wan2.1モデルの場合はこのオプションは無視されます。
102
+ * `--fs` discrete flow shiftを指定します。省略時は14.5で、ステップ数20の場合に対応した値です。HunyuanVideoの論文では、ステップ数50の場合は7.0、ステップ数20未満(10など)で17.0が推奨されています。FramePackモデルはこのオプションは無視され、10.0が使用されます。
103
 
104
+ I2Vモデルを学習する場合、以下のオプションを追加してください。
105
 
106
  * `--i path/to/image.png`: image2video推論用の画像パス。
107
 
108
+ Wan2.1-Fun-Controlモデルを学習する場合、以下のオプションを追加してください。
109
+
110
+ * `--cn path/to/control_video_or_dir_of_images`: control用の動画または複数枚の画像を含むディレクトリのパス。
111
+
112
+ classifier free guidance(ネガティブプロンプト)を必要とするモデル(Wan2.1など)を学習する場合、以下の追加オプションを使用できます。
113
 
114
+ *`--n negative prompt...`: classifier free guidance用のネガティブプロンプト。省略時はモデルごとのデフォルトプロンプトが使用されます。
115
  *`--l 6.0`: classifier free guidance scale。SkyReels V1モデルの場合は6.0に設定してください。Wan2.1の場合はデフォルト値が5.0です(省略時)。
116
  </details>
docs/wan.md CHANGED
@@ -27,24 +27,45 @@ This feature is experimental.
27
 
28
  ## Download the model / モデルのダウンロード
29
 
30
- Download the T5 `models_t5_umt5-xxl-enc-bf16.pth` and CLIP `models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` from the following page: https://huggingface.co/Wan-AI/Wan2.1-T2V-14B/tree/main
31
 
32
  Download the VAE from the above page `Wan2.1_VAE.pth` or download `split_files/vae/wan_2.1_vae.safetensors` from the following page: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/vae
33
 
34
  Download the DiT weights from the following page: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
35
 
36
- Please select the appropriate weights according to T2V, I2V, resolution, model size, etc. fp8 models can be used if `--fp8` is specified.
 
 
 
 
37
 
38
  (Thanks to Comfy-Org for providing the repackaged weights.)
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  <details>
40
  <summary>日本語</summary>
41
- T5 `models_t5_umt5-xxl-enc-bf16.pth` およびCLIP `models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` を、次のページからダウンロードしてください:https://huggingface.co/Wan-AI/Wan2.1-T2V-14B/tree/main
42
 
43
  VAEは上のページから `Wan2.1_VAE.pth` をダウンロードするか、次のページから `split_files/vae/wan_2.1_vae.safetensors` をダウンロードしてください:https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/vae
44
 
45
  DiTの重みを次のページからダウンロードしてください:https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
46
 
47
- T2VI2V、解像度、モデルサイズなどにより適切な重みを選択してください。`--fp8`指定時はfp8モデルも使用できます。
 
 
 
 
48
 
49
  (repackaged版の重みを提供してくださっているComfy-Orgに感謝いたします。)
50
  </details>
@@ -63,6 +84,8 @@ If you train I2V models, add `--clip path/to/models_clip_open-clip-xlm-roberta-l
63
 
64
  If you're running low on VRAM, specify `--vae_cache_cpu` to use the CPU for the VAE internal cache, which will reduce VRAM usage somewhat.
65
 
 
 
66
  <details>
67
  <summary>日本語</summary>
68
  latentの事前キャッシングはHunyuanVideoとほぼ同じです。上のコマンド例を使用してキャッシュを作成してください。
@@ -70,6 +93,8 @@ latentの事前キャッシングはHunyuanVideoとほぼ同じです。上の
70
  I2Vモデルを学習する場合は、`--clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` を追加してCLIPモデルを指定してください。指定しないと学習時にエラーが発生します。
71
 
72
  VRAMが不足している場合は、`--vae_cache_cpu` を指定するとVAEの内部キャッシュにCPUを使うことで、使用VRAMを多少削減できます。
 
 
73
  </details>
74
 
75
  ### Text Encoder Output Pre-caching
@@ -115,7 +140,7 @@ The above is an example. The appropriate values for `timestep_sampling` and `dis
115
 
116
  For additional options, use `python wan_train_network.py --help` (note that many options are unverified).
117
 
118
- `--task` is one of `t2v-1.3B`, `t2v-14B`, `i2v-14B` and `t2i-14B`. Specify the DiT weights for the task with `--dit`.
119
 
120
  Don't forget to specify `--network_module networks.lora_wan`.
121
 
@@ -129,7 +154,7 @@ Use `convert_lora.py` for converting the LoRA weights after training, as in Huny
129
 
130
  その他のオプションについては `python wan_train_network.py --help` を使用してください(多くのオプションは未検証です)。
131
 
132
- `--task` には `t2v-1.3B`, `t2v-14B`, `i2v-14B`, `t2i-14B` のいずれかを指定します。`--dit`に、taskに応じたDiTの重みを指定してください。
133
 
134
  `--network_module` に `networks.lora_wan` を指定することを忘れないでください。
135
 
@@ -152,7 +177,7 @@ Each option is the same as when generating images or as HunyuanVideo. Please ref
152
 
153
  If you train I2V models, add `--clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` to specify the CLIP model.
154
 
155
- You can specify the initial image and negative prompts in the prompt file. Please refer to [here](/docs/sampling_during_training.md#prompt-file--プロンプトファイル).
156
 
157
  <details>
158
  <summary>日本語</summary>
@@ -160,12 +185,23 @@ You can specify the initial image and negative prompts in the prompt file. Pleas
160
 
161
  I2Vモデルを学習する場合は、`--clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` を追加してCLIPモデルを指定してください。
162
 
163
- プロンプトファイルで、初期画像やネガティブプロンプト等を指定できます。[こちら](/docs/sampling_during_training.md#prompt-file--プロンプトファイル)を参照してください。
164
  </details>
165
 
166
 
167
  ## Inference / 推論
168
 
 
 
 
 
 
 
 
 
 
 
 
169
  ### T2V Inference / T2V推論
170
 
171
  The following is an example of T2V inference (input as a single line):
@@ -178,30 +214,64 @@ python wan_generate_video.py --fp8 --task t2v-1.3B --video_size 832 480 --video
178
  --attn_mode torch
179
  ```
180
 
181
- `--task` is one of `t2v-1.3B`, `t2v-14B`, `i2v-14B` and `t2i-14B`.
182
 
183
  `--attn_mode` is `torch`, `sdpa` (same as `torch`), `xformers`, `sageattn`,`flash2`, `flash` (same as `flash2`) or `flash3`. `torch` is the default. Other options require the corresponding library to be installed. `flash3` (Flash attention 3) is not tested.
184
 
 
 
 
 
 
 
185
  `--fp8_t5` can be used to specify the T5 model in fp8 format. This option reduces memory usage for the T5 model.
186
 
187
  `--negative_prompt` can be used to specify a negative prompt. If omitted, the default negative prompt is used.
188
 
189
- ` --flow_shift` can be used to specify the flow shift (default 3.0 for I2V with 480p, 5.0 for others).
190
 
191
- `--guidance_scale` can be used to specify the guidance scale for classifier free guiance (default 5.0).
192
 
193
  `--blocks_to_swap` is the number of blocks to swap during inference. The default value is None (no block swap). The maximum value is 39 for 14B model and 29 for 1.3B model.
194
 
195
  `--vae_cache_cpu` enables VAE cache in main memory. This reduces VRAM usage slightly but processing is slower.
196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  Other options are same as `hv_generate_video.py` (some options are not supported, please check the help).
198
 
199
  <details>
200
  <summary>日本語</summary>
201
- `--task` には `t2v-1.3B`, `t2v-14B`, `i2v-14B`, `t2i-14B` のいずれかを指定します。
202
 
203
  `--attn_mode` には `torch`, `sdpa`(`torch`と同じ)、`xformers`, `sageattn`, `flash2`, `flash`(`flash2`と同じ), `flash3` のいずれかを指定します。デフォルトは `torch` です。その他のオプションを使用する場合は、対応するライブラリをインストールする必要があります。`flash3`(Flash attention 3)は未テストです。
204
 
 
 
 
 
 
 
205
  `--fp8_t5` を指定するとT5モデルをfp8形式で実行します。T5モデル呼び出し時のメモリ使用量を削減します。
206
 
207
  `--negative_prompt` でネガティブプロンプトを指定できます。省略した場合はデフォルトのネガティブプロンプトが使用されます。
@@ -214,9 +284,116 @@ Other options are same as `hv_generate_video.py` (some options are not supported
214
 
215
  `--vae_cache_cpu` を有効にすると、VAEのキャッシュをメインメモリに保持します。VRAM使用量が多少減りますが、処理は遅くなります。
216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  その他のオプションは `hv_generate_video.py` と同じです(一部のオプションはサポートされていないため、ヘルプを確認してください)。
218
  </details>
219
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  ### I2V Inference / I2V推論
221
 
222
  The following is an example of I2V inference (input as a single line):
@@ -231,11 +408,124 @@ python wan_generate_video.py --fp8 --task i2v-14B --video_size 832 480 --video_l
231
 
232
  Add `--clip` to specify the CLIP model. `--image_path` is the path to the image to be used as the initial frame.
233
 
 
 
 
 
234
  Other options are same as T2V inference.
235
 
236
  <details>
237
  <summary>日本語</summary>
238
  `--clip` を追加してCLIPモデルを指定します。`--image_path` は初期フレームとして使用する画像のパスです。
239
 
 
 
 
 
240
  その他のオプションはT2V推論と同じです。
241
  </details>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  ## Download the model / モデルのダウンロード
29
 
30
+ Download the T5 `models_t5_umt5-xxl-enc-bf16.pth` and CLIP `models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` from the following page: https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P/tree/main
31
 
32
  Download the VAE from the above page `Wan2.1_VAE.pth` or download `split_files/vae/wan_2.1_vae.safetensors` from the following page: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/vae
33
 
34
  Download the DiT weights from the following page: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
35
 
36
+ Wan2.1 Fun Control model weights can be downloaded from [here](https://huggingface.co/alibaba-pai/Wan2.1-Fun-14B-Control). Navigate to each weight page and download. The Fun Control model seems to support not only T2V but also I2V tasks.
37
+
38
+ Please select the appropriate weights according to T2V, I2V, resolution, model size, etc.
39
+
40
+ `fp16` and `bf16` models can be used, and `fp8_e4m3fn` models can be used if `--fp8` (or `--fp8_base`) is specified without specifying `--fp8_scaled`. **Please note that `fp8_scaled` models are not supported even with `--fp8_scaled`.**
41
 
42
  (Thanks to Comfy-Org for providing the repackaged weights.)
43
+
44
+ ### Model support matrix / モデルサポートマトリックス
45
+
46
+ * columns: training dtype (列:学習時のデータ型)
47
+ * rows: model dtype (行:モデルのデータ型)
48
+
49
+ | model \ training |bf16|fp16|--fp8_base|--fp8_base & --fp8_scaled|
50
+ |--|--|--|--|--|
51
+ |bf16|✓|--|✓|✓|
52
+ |fp16|--|✓|✓|✓|
53
+ |fp8_e4m3fn|--|--|✓|--|
54
+ |fp8_scaled|--|--|--|--|
55
+
56
  <details>
57
  <summary>日本語</summary>
58
+ T5 `models_t5_umt5-xxl-enc-bf16.pth` およびCLIP `models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` を、次のページからダウンロードしてください:https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P/tree/main
59
 
60
  VAEは上のページから `Wan2.1_VAE.pth` をダウンロードするか、次のページから `split_files/vae/wan_2.1_vae.safetensors` をダウンロードしてください:https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/vae
61
 
62
  DiTの重みを次のページからダウンロードしてください:https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
63
 
64
+ Wan2.1 Fun Controlモデルの重みは、[こちら](https://huggingface.co/alibaba-pai/Wan2.1-Fun-14B-Control)から、それぞれの重みのページに遷移し、ダウンロードしてください。Fun ControlモデルはT2VだけでなくI2Vタスクにも対応しているようです。
65
+
66
+ T2VやI2V、解像度、モデルサイズなどにより適切な重みを選択してください。
67
+
68
+ `fp16` および `bf16` モデルを使用できます。また、`--fp8` (または`--fp8_base`)を指定し`--fp8_scaled`を指定をしないときには `fp8_e4m3fn` モデルを使用できます。**`fp8_scaled` モデルはいずれの場合もサポートされていませんのでご注意ください。**
69
 
70
  (repackaged版の重みを提供してくださっているComfy-Orgに感謝いたします。)
71
  </details>
 
84
 
85
  If you're running low on VRAM, specify `--vae_cache_cpu` to use the CPU for the VAE internal cache, which will reduce VRAM usage somewhat.
86
 
87
+ The control video settings are required for training the Fun-Control model. Please refer to [Dataset Settings](/dataset/dataset_config.md#sample-for-video-dataset-with-control-images) for details.
88
+
89
  <details>
90
  <summary>日本語</summary>
91
  latentの事前キャッシングはHunyuanVideoとほぼ同じです。上のコマンド例を使用してキャッシュを作成してください。
 
93
  I2Vモデルを学習する場合は、`--clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` を追加してCLIPモデルを指定してください。指定しないと学習時にエラーが発生します。
94
 
95
  VRAMが不足している場合は、`--vae_cache_cpu` を指定するとVAEの内部キャッシュにCPUを使うことで、使用VRAMを多少削減できます。
96
+
97
+ Fun-Controlモデルを学習する場合は、制御用動画の設定が必要です。[データセット設定](/dataset/dataset_config.md#sample-for-video-dataset-with-control-images)を参照してください。
98
  </details>
99
 
100
  ### Text Encoder Output Pre-caching
 
140
 
141
  For additional options, use `python wan_train_network.py --help` (note that many options are unverified).
142
 
143
+ `--task` is one of `t2v-1.3B`, `t2v-14B`, `i2v-14B`, `t2i-14B` (for Wan2.1 official models), `t2v-1.3B-FC`, `t2v-14B-FC`, and `i2v-14B-FC` (for Wan2.1 Fun Control model). Specify the DiT weights for the task with `--dit`.
144
 
145
  Don't forget to specify `--network_module networks.lora_wan`.
146
 
 
154
 
155
  その他のオプションについては `python wan_train_network.py --help` を使用してください(多くのオプションは未検証です)。
156
 
157
+ `--task` には `t2v-1.3B`, `t2v-14B`, `i2v-14B`, `t2i-14B` (これらはWan2.1公式モデル)、`t2v-1.3B-FC`, `t2v-14B-FC`, `i2v-14B-FC`(Wan2.1-Fun Controlモデル)を指定します。`--dit`に、taskに応じたDiTの重みを指定してください。
158
 
159
  `--network_module` に `networks.lora_wan` を指定することを忘れないでください。
160
 
 
177
 
178
  If you train I2V models, add `--clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` to specify the CLIP model.
179
 
180
+ You can specify the initial image, the negative prompt and the control video (for Wan2.1-Fun-Control) in the prompt file. Please refer to [here](/docs/sampling_during_training.md#prompt-file--プロンプトファイル).
181
 
182
  <details>
183
  <summary>日本語</summary>
 
185
 
186
  I2Vモデルを学習する場合は、`--clip path/to/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth` を追加してCLIPモデルを指定してください。
187
 
188
+ プロンプトファイルで、初期画像やネガティブプロンプト、制御動画(Wan2.1-Fun-Control用)等を指定できます。[こちら](/docs/sampling_during_training.md#prompt-file--プロンプトファイル)を参照してください。
189
  </details>
190
 
191
 
192
  ## Inference / 推論
193
 
194
+ ### Inference Options Comparison / 推論オプション比較
195
+
196
+ #### Speed Comparison (Faster → Slower) / 速度比較(速い→遅い)
197
+ *Note: Results may vary depending on GPU type*
198
+
199
+ fp8_fast > bf16/fp16 (no block swap) > fp8 > fp8_scaled > bf16/fp16 (block swap)
200
+
201
+ #### Quality Comparison (Higher → Lower) / 品質比較(高→低)
202
+
203
+ bf16/fp16 > fp8_scaled > fp8 >> fp8_fast
204
+
205
  ### T2V Inference / T2V推論
206
 
207
  The following is an example of T2V inference (input as a single line):
 
214
  --attn_mode torch
215
  ```
216
 
217
+ `--task` is one of `t2v-1.3B`, `t2v-14B`, `i2v-14B`, `t2i-14B` (these are Wan2.1 official models), `t2v-1.3B-FC`, `t2v-14B-FC` and `i2v-14B-FC` (for Wan2.1-Fun Control model).
218
 
219
  `--attn_mode` is `torch`, `sdpa` (same as `torch`), `xformers`, `sageattn`,`flash2`, `flash` (same as `flash2`) or `flash3`. `torch` is the default. Other options require the corresponding library to be installed. `flash3` (Flash attention 3) is not tested.
220
 
221
+ Specifying `--fp8` runs DiT in fp8 mode. fp8 can significantly reduce memory consumption but may impact output quality.
222
+
223
+ `--fp8_scaled` can be specified in addition to `--fp8` to apply fp8 weight optimization to the model. This slightly increases memory consumption and inference time, but improves output quality. See [here](advanced_config.md#fp8-weight-optimization-for-models--モデルの重みのfp8への最適化) for details.
224
+
225
+ The `--fp8_fast` option is also available for faster inference on RTX 40x0 GPUs. It requires the `--fp8_scaled` option. **This option seems to degrade the output quality.**
226
+
227
  `--fp8_t5` can be used to specify the T5 model in fp8 format. This option reduces memory usage for the T5 model.
228
 
229
  `--negative_prompt` can be used to specify a negative prompt. If omitted, the default negative prompt is used.
230
 
231
+ `--flow_shift` can be used to specify the flow shift (default 3.0 for I2V with 480p, 5.0 for others).
232
 
233
+ `--guidance_scale` can be used to specify the guidance scale for classifier free guidance (default 5.0).
234
 
235
  `--blocks_to_swap` is the number of blocks to swap during inference. The default value is None (no block swap). The maximum value is 39 for 14B model and 29 for 1.3B model.
236
 
237
  `--vae_cache_cpu` enables VAE cache in main memory. This reduces VRAM usage slightly but processing is slower.
238
 
239
+ `--compile` enables torch.compile. See [here](/README.md#inference) for details.
240
+
241
+ `--trim_tail_frames` can be used to trim the tail frames when saving. The default is 0.
242
+
243
+ `--cfg_skip_mode` specifies the mode for skipping CFG at certain steps. The default is `none` (CFG is applied at all steps). `--cfg_apply_ratio` specifies the ratio of steps where CFG is applied. See below for details.
244
+
245
+ `--include_patterns` and `--exclude_patterns` can be used to specify which LoRA modules to apply or exclude during inference. If not specified, all modules are applied by default. These options accept regular expressions.
246
+
247
+ `--include_patterns` specifies the modules to be applied, and `--exclude_patterns` specifies the modules to be excluded. The regular expression is matched against the LoRA key name, and include takes precedence.
248
+
249
+ The key name to be searched is in sd-scripts format (`lora_unet_<module_name with dot replaced by _>`). For example, `lora_unet_blocks_9_cross_attn_k`.
250
+
251
+ For example, if you specify `--exclude_patterns "blocks_[23]\d_"`, it will exclude modules containing `blocks_20` to `blocks_39`. If you specify `--include_patterns "cross_attn" --exclude_patterns "blocks_(0|1|2|3|4)_"`, it will apply LoRA to modules containing `cross_attn` and not containing `blocks_0` to `blocks_4`.
252
+
253
+ If you specify multiple LoRA weights, please specify them with multiple arguments. For example: `--include_patterns "cross_attn" ".*" --exclude_patterns "dummy_do_not_exclude" "blocks_(0|1|2|3|4)"`. `".*"` is a regex that matches everything. `dummy_do_not_exclude` is a dummy regex that does not match anything.
254
+
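+ One way to picture the selection rule, consistent with the combined example above, is the sketch below. The helper name is made up for illustration, and the actual handling of precedence and of per-LoRA pattern lists in the script may differ.
+
+ ```python
+ import re
+
+
+ def lora_module_enabled(key: str, include_patterns: list[str], exclude_patterns: list[str]) -> bool:
+     # key is an sd-scripts style LoRA key, e.g. "lora_unet_blocks_9_cross_attn_k"
+     if include_patterns and not any(re.search(p, key) for p in include_patterns):
+         return False  # include patterns define the base set of modules
+     if exclude_patterns and any(re.search(p, key) for p in exclude_patterns):
+         return False  # exclude patterns then remove modules from that set
+     return True       # nothing specified (or nothing excluded): applied
+
+
+ # apply LoRA to cross attention modules, except in blocks 0-4
+ print(lora_module_enabled("lora_unet_blocks_9_cross_attn_k", ["cross_attn"], [r"blocks_(0|1|2|3|4)_"]))  # True
+ print(lora_module_enabled("lora_unet_blocks_2_cross_attn_k", ["cross_attn"], [r"blocks_(0|1|2|3|4)_"]))  # False
+ ```
+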
255
+ `--cpu_noise` generates initial noise on the CPU. This may result in the same results as ComfyUI with the same seed (depending on other settings).
256
+
257
+ If you are using the Fun Control model, specify the control video with `--control_path`. You can specify a video file or a folder containing multiple image files. The number of frames in the video file (or the number of images) should be at least the number specified in `--video_length` (plus 1 frame if you specify `--end_image_path`).
258
+
259
+ Please try to match the aspect ratio of the control video with the aspect ratio specified in `--video_size` (there may be some deviation from the initial image of I2V due to the use of bucketing processing).
260
+
261
  Other options are same as `hv_generate_video.py` (some options are not supported, please check the help).
262
 
263
  <details>
264
  <summary>日本語</summary>
265
+ `--task` には `t2v-1.3B`, `t2v-14B`, `i2v-14B`, `t2i-14B` (これらはWan2.1公式モデル)、`t2v-1.3B-FC`, `t2v-14B-FC`, `i2v-14B-FC`(Wan2.1-Fun Controlモデル)を指定します。
266
 
267
  `--attn_mode` には `torch`, `sdpa`(`torch`と同じ)、`xformers`, `sageattn`, `flash2`, `flash`(`flash2`と同じ), `flash3` のいずれかを指定します。デフォルトは `torch` です。その他のオプションを使用する場合は、対応するライブラリをインストールする必要があります。`flash3`(Flash attention 3)は未テストです。
268
 
269
+ `--fp8` を指定するとDiTモデルをfp8形式で実行します。fp8はメモリ消費を大幅に削減できますが、出力品質に影響を与える可能性があります。
270
+
271
+ `--fp8_scaled` を `--fp8` と併用すると、fp8への重み量子化を行います。メモリ消費と速度はわずかに悪化しますが、出力品質が向上します。詳しくは[こちら](advanced_config.md#fp8-weight-optimization-for-models--モデルの重みのfp8への最適化)を参照してください。
272
+
273
+ `--fp8_fast` オプションはRTX 40x0 GPUでの高速推論に使用されるオプションです。このオプションは `--fp8_scaled` オプションが必要です。**出力品質が劣化するようです。**
274
+
275
  `--fp8_t5` を指定するとT5モデルをfp8形式で実行します。T5モデル呼び出し時のメモリ使用量を削減します。
276
 
277
  `--negative_prompt` でネガティブプロンプトを指定できます。省略した場合はデフォルトのネガティブプロンプトが使用されます。
 
284
 
285
  `--vae_cache_cpu` を有効にすると、VAEのキャッシュをメインメモリに保持します。VRAM使用量が多少減りますが、処理は遅くなります。
286
 
287
+ `--compile`でtorch.compileを有効にします。詳細については[こちら](/README.md#inference)を参照してください。
288
+
289
+ `--trim_tail_frames` で保存時に末尾のフレームをトリミングできます。デフォルトは0です。
290
+
291
+ `--cfg_skip_mode` は異なるステップでCFGをスキップするモードを指定します。デフォルトは `none`(全ステップ)。`--cfg_apply_ratio` はCFGが適用されるステップの割合を指定します。詳細は後述します。
292
+
293
+ LoRAのどのモジュールを適用するかを、`--include_patterns`と`--exclude_patterns`で指定できます(未指定時・デフォルトは全モジュール適用されます
294
+ )。これらのオプションには、正規表現を指定します。`--include_patterns`は適用するモジュール、`--exclude_patterns`は適用しないモジュールを指定します。正規表現がLoRAのキー名に含まれるかどうかで判断され、includeが優先されます。
295
+
296
+ 検索対象となるキー名は sd-scripts 形式(`lora_unet_<モジュール名のドットを_に置換したもの>`)です。例:`lora_unet_blocks_9_cross_attn_k`
297
+
298
+ たとえば `--exclude_patterns "blocks_[23]\d_"`のみを指定すると、`blocks_20`から`blocks_39`を含むモジュールが除外されます。`--include_patterns "cross_attn" --exclude_patterns "blocks_(0|1|2|3|4)_"`のようにincludeとexcludeを指定すると、`cross_attn`を含むモジュールで、かつ`blocks_0`から`blocks_4`を含まないモジュールにLoRAが適用されます。
299
+
300
+ 複数のLoRAの重みを指定する場合は、複数個の引数で指定してください。例:`--include_patterns "cross_attn" ".*" --exclude_patterns "dummy_do_not_exclude" "blocks_(0|1|2|3|4)"` `".*"`は全てにマッチする正規表現です。`dummy_do_not_exclude`は何にもマッチしないダミーの正規表現です。
301
+
302
+ `--cpu_noise`を指定すると初期ノイズをCPUで生成します。これにより同一seed時の結果がComfyUIと同じになる可能性があります(他の設定にもよります)。
303
+
304
+ Fun Controlモデルを使用する場合は、`--control_path`で制御用の映像を指定します。動画ファイル、または複数枚の画像ファイルを含んだフォルダを指定できます。動画ファイルのフレーム数(または画像の枚数)は、`--video_length`で指定したフレーム数以上にしてください(後述の`--end_image_path`を指定した場合は、さらに+1フレーム)。
305
+
306
+ 制御用の映像のアスペクト比は、`--video_size`で指定したアスペクト比とできるかぎり合わせてください(bucketingの処理を流用しているためI2Vの初期画像とズレる場合があります)。
307
+
308
  その他のオプションは `hv_generate_video.py` と同じです(一部のオプションはサポートされていないため、ヘルプを確認してください)。
309
  </details>
310
 
311
+ #### CFG Skip Mode / CFGスキップモード
312
+
313
+ These options allow you to balance generation speed against prompt accuracy. The more steps are skipped, the faster the generation, at the cost of potential quality degradation.
314
+
315
+ Setting `--cfg_apply_ratio` to 0.5 speeds up the denoising loop by up to 25%, because skipped steps run only the conditional forward pass instead of both the conditional and unconditional passes.
316
+
317
+ `--cfg_skip_mode` accepts one of the following modes:
318
+
319
+ - `early`: Skips CFG in early steps for faster generation, applying guidance mainly in later refinement steps
320
+ - `late`: Skips CFG in later steps, applying guidance during initial structure formation
321
+ - `middle`: Skips CFG in middle steps, applying guidance in both early and later steps
322
+ - `early_late`: Skips CFG in both early and late steps, applying only in middle steps
323
+ - `alternate`: Applies CFG in alternate steps based on the specified ratio
324
+ - `none`: Applies CFG at all steps (default)
325
+
326
+ `--cfg_apply_ratio` specifies a value from 0.0 to 1.0 controlling the proportion of steps where CFG is applied. For example, setting 0.5 means CFG will be applied in only 50% of the steps.
327
+
328
+ If num_steps is 10, the following table shows the steps where CFG is applied based on the `--cfg_skip_mode` option (A means CFG is applied, S means it is skipped, `--cfg_apply_ratio` is 0.6):
329
+
330
+ | skip mode | CFG apply pattern |
331
+ |---|---|
332
+ | early | SSSSAAAAAA |
333
+ | late | AAAAAASSSS |
334
+ | middle | AAASSSSAAA |
335
+ | early_late | SSAAAAAASS |
336
+ | alternate | SASASAASAS |
337
+
338
+ The appropriate settings are unknown, but you may want to try `late` or `early_late` mode with a ratio of around 0.3 to 0.5.
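+
+ As a rough illustration, the block-style modes in the table can be reproduced with a sketch like the one below. The helper name is made up, and the actual step selection in the script may differ, in particular in rounding and in how `alternate` interleaves the applied steps.
+
+ ```python
+ def cfg_apply_flags(num_steps: int, mode: str, ratio: float) -> str:
+     # Returns one character per step: A = CFG applied, S = CFG skipped.
+     apply = round(num_steps * ratio)   # steps that keep CFG
+     skip = num_steps - apply           # steps that drop the unconditional pass
+     if mode == "early":                # skip early steps, guide the later refinement steps
+         flags = [False] * skip + [True] * apply
+     elif mode == "late":               # guide the early structure-forming steps, skip later ones
+         flags = [True] * apply + [False] * skip
+     elif mode == "middle":             # guide both ends, skip the middle
+         head = apply // 2
+         flags = [True] * head + [False] * skip + [True] * (apply - head)
+     elif mode == "early_late":         # skip both ends, guide only the middle
+         head = skip // 2
+         flags = [False] * head + [True] * apply + [False] * (skip - head)
+     else:                              # "none"; "alternate" instead interleaves A and S
+         flags = [True] * num_steps
+     return "".join("A" if f else "S" for f in flags)
+
+
+ print(cfg_apply_flags(10, "early_late", 0.6))  # SSAAAAAASS, as in the table above
+ ```
+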
339
+ <details>
340
+ <summary>日本語</summary>
341
+ これらのオプションは、生成速度とプロンプトの精度のバランスを取ることができます。スキップされるステップが多いほど、生成速度が速くなりますが、品質が低下する可能性があります。
342
+
343
+ ratioに0.5を指定���ることで、デノイジングのループが最大25%程度、高速化されます。
344
+
345
+ `--cfg_skip_mode` は次のモードのいずれかを指定します:
346
+
347
+ - `early`:初期のステップでCFGをスキップして、主に終盤の精細化のステップで適用します
348
+ - `late`:終盤のステップでCFGをスキップし、初期の構造が決まる段階で適用します
349
+ - `middle`:中間のステップでCFGをスキップし、初期と終盤のステップの両方で適用します
350
+ - `early_late`:初期と終盤のステップの両方でCFGをスキップし、中間のステップのみ適用します
351
+ - `alternate`:指定された割合に基づいてCFGを適用します
352
+
353
+ `--cfg_apply_ratio` は、CFGが適用されるステップの割合を0.0から1.0の値で指定します。たとえば、0.5に設定すると、CFGはステップの50%のみで適用されます。
354
+
355
+ 具体的なパターンは上のテーブルを参照してください。
356
+
357
+ 適切な設定は不明ですが、モードは`late`または`early_late`、ratioは0.3~0.5程度から試してみると良いかもしれません。
358
+ </details>
359
+
360
+ #### Skip Layer Guidance
361
+
362
+ Skip Layer Guidance is a feature that uses the output of a model with some blocks skipped as the unconditional output of classifier free guidance. It was originally proposed in [SD 3.5](https://github.com/comfyanonymous/ComfyUI/pull/5404) and first applied in Wan2GP in [this PR](https://github.com/deepbeepmeep/Wan2GP/pull/61). It may improve the quality of generated videos.
363
+
364
+ The implementation of SD 3.5 is [here](https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py), and the implementation of Wan2GP (the PR mentioned above) has some different specifications. This inference script allows you to choose between the two methods.
365
+
366
+ *The SD3.5 method applies slg output in addition to cond and uncond (slows down the speed). The Wan2GP method uses only cond and slg output.*
367
+
368
+ The following arguments are available:
369
+
370
+ - `--slg_mode`: Specifies the SLG mode. `original` for SD 3.5 method, `uncond` for Wan2GP method. Default is None (no SLG).
371
+ - `--slg_layers`: Specifies the indices of the blocks (layers) to skip in SLG, separated by commas. Example: `--slg_layers 4,5,6`. Default is empty (no skip). If this option is not specified, `--slg_mode` is ignored.
372
+ - `--slg_scale`: Specifies the scale of SLG when `original`. Default is 3.0.
373
+ - `--slg_start`: Specifies the start step of SLG application in inference steps from 0.0 to 1.0. Default is 0.0 (applied from the beginning).
374
+ - `--slg_end`: Specifies the end step of SLG application in inference steps from 0.0 to 1.0. Default is 0.3 (applied up to 30% from the beginning).
375
+
376
+ Appropriate settings are unknown, but you may want to try `original` mode with a scale of around 3.0 and a start ratio of 0.0 and an end ratio of 0.5, with layers 4, 5, and 6 skipped.
377
+
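+ For intuition, the two modes combine the three model outputs roughly as sketched below. This is based on the descriptions above and the SD 3.5 reference implementation; the function name is made up and the exact formulas used by this script may differ.
+
+ ```python
+ def combine_with_slg(cond, uncond, slg_out, guidance_scale, slg_scale, slg_mode):
+     # cond / uncond: model outputs with and without the prompt (tensors of the same shape)
+     # slg_out: output of the model with the --slg_layers blocks skipped
+     if slg_mode == "original":   # SD 3.5 style: normal CFG plus an extra skip-layer term
+         pred = uncond + guidance_scale * (cond - uncond)
+         pred = pred + slg_scale * (cond - slg_out)
+     elif slg_mode == "uncond":   # Wan2GP style: the skip-layer output replaces uncond
+         pred = slg_out + guidance_scale * (cond - slg_out)
+     else:                        # SLG disabled: plain classifier-free guidance
+         pred = uncond + guidance_scale * (cond - uncond)
+     return pred
+ ```
+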
378
+ <details>
379
+ <summary>日本語</summary>
380
+ Skip Layer Guidanceは、一部のblockをスキップしたモデル出力をclassifier free guidanceのunconditional出力に使用する機能です。元々は[SD 3.5](https://github.com/comfyanonymous/ComfyUI/pull/5404)で提案されたもので、Wan2.1には[Wan2GPのこちらのPR](https://github.com/deepbeepmeep/Wan2GP/pull/61)で初めて適用されました。生成動画の品質が向上する可能性があります。
381
+
382
+ SD 3.5の実装は[こちら](https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py)で、Wan2GPの実装(前述のPR)は一部仕様が異なります。この推論スクリプトでは両者の方式を選択できるようになっています。
383
+
384
+ ※SD3.5方式はcondとuncondに加えてslg outputを適用します(速度が低下します)。Wan2GP方式はcondとslg outputのみを使用します。
385
+
386
+ 以下の引数があります。
387
+
388
+ - `--slg_mode`:SLGのモードを指定します。`original`でSD 3.5の方式、`uncond`でWan2GPの方式です。デフォルトはNoneで、SLGを使用しません。
389
+ - `--slg_layers`:SLGでスキップするblock (layer)のインデクスをカンマ区切りで指定します。例:`--slg_layers 4,5,6`。デフォルトは空(スキップしない)です。このオプションを指定しないと`--slg_mode`は無視されます。
390
+ - `--slg_scale`:`original`のときのSLGのスケールを指定します。デフォルトは3.0です。
391
+ - `--slg_start`:推論ステップのSLG適用開始ステップを0.0から1.0の割合で指定します。デフォルトは0.0です(最初から適用)。
392
+ - `--slg_end`:推論ステップのSLG適用終了ステップを0.0から1.0の割合で指定します。デフォルトは0.3です(最初から30%まで適用)。
393
+
394
+ 適切な設定は不明ですが、`original`モードでスケールを3.0程度、開始割合を0.0、終了割合を0.5程度に設定し、4, 5, 6のlayerをスキップする設定から始めると良いかもしれません。
395
+ </details>
396
+
397
  ### I2V Inference / I2V推論
398
 
399
  The following is an example of I2V inference (input as a single line):
 
408
 
409
  Add `--clip` to specify the CLIP model. `--image_path` is the path to the image to be used as the initial frame.
410
 
411
+ `--end_image_path` can be used to specify the end image. This option is experimental. When this option is specified, the saved video will be slightly longer than the specified number of frames and will have noise, so it is recommended to specify `--trim_tail_frames 3` to trim the tail frames.
412
+
413
+ You can also use the Fun Control model for I2V inference. Specify the control video with `--control_path`.
414
+
415
  Other options are same as T2V inference.
416
 
417
  <details>
418
  <summary>日本語</summary>
419
  `--clip` を追加してCLIPモデルを指定します。`--image_path` は初期フレームとして使用する画像のパスです。
420
 
421
+ `--end_image_path` で終了画像を指定できます。このオプションは実験的なものです。このオプションを指定すると、保存される動画が指定フレーム数よりもやや多くなり、かつノイズが乗るため、`--trim_tail_frames 3` などを指定して末尾のフレームをトリミングすることをお勧めします。
422
+
423
+ I2V推論でもFun Controlモデルが使用できます。`--control_path` で制御用の映像を指定します。
424
+
425
  その他のオプションはT2V推論と同じです。
426
  </details>
427
+
428
+ ### New Batch and Interactive Modes / 新しいバッチモードとインタラクティブモード
429
+
430
+ In addition to single video generation, Wan 2.1 now supports batch generation from file and interactive prompt input:
431
+
432
+ #### Batch Mode from File / ファイルからのバッチモード
433
+
434
+ Generate multiple videos from prompts stored in a text file:
435
+
436
+ ```bash
437
+ python wan_generate_video.py --from_file prompts.txt --task t2v-14B \
438
+ --dit path/to/model.safetensors --vae path/to/vae.safetensors \
439
+ --t5 path/to/t5_model.pth --save_path output_directory
440
+ ```
441
+
442
+ The prompts file format:
443
+ - One prompt per line
444
+ - Empty lines and lines starting with # are ignored (comments)
445
+ - Each line can include prompt-specific parameters using command-line style format:
446
+
447
+ ```
448
+ A beautiful sunset over mountains --w 832 --h 480 --f 81 --d 42 --s 20
449
+ A busy city street at night --w 480 --h 832 --g 7.5 --n low quality, blurry
450
+ ```
451
+
452
+ Supported inline parameters (if omitted, default values from the command line are used); a rough parsing sketch follows this list:
453
+ - `--w`: Width
454
+ - `--h`: Height
455
+ - `--f`: Frame count
456
+ - `--d`: Seed
457
+ - `--s`: Inference steps
458
+ - `--g` or `--l`: Guidance scale
459
+ - `--fs`: Flow shift
460
+ - `--i`: Image path (for I2V)
461
+ - `--cn`: Control path (for Fun Control)
462
+ - `--n`: Negative prompt
463
+
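+ Conceptually, each line is split into the prompt text and its overrides as in the simplified sketch below (the helper name is made up; the actual parser in `wan_generate_video.py` may handle edge cases differently):
+
+ ```python
+ def parse_prompt_line(line: str) -> tuple[str, dict[str, str]]:
+     # "prompt text --w 832 --h 480" -> ("prompt text", {"w": "832", "h": "480"})
+     parts = line.strip().split(" --")
+     prompt = parts[0].strip()
+     overrides = {}
+     for part in parts[1:]:
+         key, _, value = part.partition(" ")
+         overrides[key] = value.strip()  # values keep their spaces, e.g. {"n": "low quality, blurry"}
+     return prompt, overrides
+
+
+ print(parse_prompt_line("A beautiful sunset over mountains --w 832 --h 480 --d 42"))
+ ```
+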
464
+ In batch mode, models are loaded once and reused for all prompts, significantly improving overall generation time compared to multiple single runs.
465
+
466
+ #### Interactive Mode / インタラクティブモード
467
+
468
+ Interactive command-line interface for entering prompts:
469
+
470
+ ```bash
471
+ python wan_generate_video.py --interactive --task t2v-14B \
472
+ --dit path/to/model.safetensors --vae path/to/vae.safetensors \
473
+ --t5 path/to/t5_model.pth --save_path output_directory
474
+ ```
475
+
476
+ In interactive mode:
477
+ - Enter prompts directly at the command line
478
+ - Use the same inline parameter format as batch mode
479
+ - Use Ctrl+D (or Ctrl+Z on Windows) to exit
480
+ - Models remain loaded between generations for efficiency
481
+
482
+ <details>
483
+ <summary>日本語</summary>
484
+ 単一動画の生成に加えて、Wan 2.1は現在、ファイルからのバッチ生成とインタラクティブなプロンプト入力をサポートしています。
485
+
486
+ #### ファイルからのバッチモード
487
+
488
+ テキストファイルに保存されたプロンプトから複数の動画を生成します:
489
+
490
+ ```bash
491
+ python wan_generate_video.py --from_file prompts.txt --task t2v-14B \
492
+ --dit path/to/model.safetensors --vae path/to/vae.safetensors \
493
+ --t5 path/to/t5_model.pth --save_path output_directory
494
+ ```
495
+
496
+ プロンプトファイルの形式:
497
+ - 1行に1つのプロンプト
498
+ - 空行や#で始まる行は無視されます(コメント)
499
+ - 各行にはコマンドライン形式でプロンプト固有のパラメータを含めることができます:
500
+
501
+ サポートされているインラインパラメータ(省略した場合、コマンドラインのデフォルト値が使用されます)
502
+ - `--w`: 幅
503
+ - `--h`: 高さ
504
+ - `--f`: フレーム数
505
+ - `--d`: シード
506
+ - `--s`: 推論ステップ
507
+ - `--g` または `--l`: ガイダンススケール
508
+ - `--fs`: フローシフト
509
+ - `--i`: 画像パス(I2V用)
510
+ - `--cn`: コントロールパス(Fun Control用)
511
+ - `--n`: ネガティブプロンプト
512
+
513
+ バッチモードでは、モデルは一度だけロードされ、すべてのプロンプトで再利用されるため、複数回の単一実行と比較して全体的な生成時間が大幅に改善されます。
514
+
515
+ #### インタラクティブモード
516
+
517
+ プロンプトを入力するためのインタラクティブなコマンドラインインターフェース:
518
+
519
+ ```bash
520
+ python wan_generate_video.py --interactive --task t2v-14B \
521
+ --dit path/to/model.safetensors --vae path/to/vae.safetensors \
522
+ --t5 path/to/t5_model.pth --save_path output_directory
523
+ ```
524
+
525
+ インタラクティブモードでは:
526
+ - コマンドラインで直接プロンプトを入力
527
+ - バッチモードと同じインラインパラメータ形式を使用
528
+ - 終了するには Ctrl+D (Windowsでは Ctrl+Z) を使用
529
+ - 効率のため、モデルは生成間で読み込まれたままになります
530
+ </details>
531
+
fpack_cache_latents.py ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import math
4
+ import os
5
+ from typing import List, Optional
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from tqdm import tqdm
11
+ from transformers import SiglipImageProcessor, SiglipVisionModel
12
+ from PIL import Image
13
+
14
+ from dataset import config_utils
15
+ from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
16
+ from dataset.image_video_dataset import BaseDataset, ItemInfo, save_latent_cache_framepack, ARCHITECTURE_FRAMEPACK
17
+ from frame_pack import hunyuan
18
+ from frame_pack.framepack_utils import load_image_encoders, load_vae
19
+ from hunyuan_model.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
20
+ from frame_pack.clip_vision import hf_clip_vision_encode
21
+ import cache_latents
22
+
23
+ logger = logging.getLogger(__name__)
24
+ logging.basicConfig(level=logging.INFO)
25
+
26
+
27
+ def encode_and_save_batch(
28
+ vae: AutoencoderKLCausal3D,
29
+ feature_extractor: SiglipImageProcessor,
30
+ image_encoder: SiglipVisionModel,
31
+ batch: List[ItemInfo],
32
+ vanilla_sampling: bool = False,
33
+ one_frame: bool = False,
34
+ one_frame_no_2x: bool = False,
35
+ one_frame_no_4x: bool = False,
36
+ ):
37
+ """Encode a batch of original RGB videos and save FramePack section caches."""
38
+ if one_frame:
39
+ encode_and_save_batch_one_frame(
40
+ vae, feature_extractor, image_encoder, batch, vanilla_sampling, one_frame_no_2x, one_frame_no_4x
41
+ )
42
+ return
43
+
44
+ latent_window_size = batch[0].fp_latent_window_size # all items should have the same window size
45
+
46
+ # Stack batch into tensor (B,C,F,H,W) in RGB order
47
+ contents = torch.stack([torch.from_numpy(item.content) for item in batch])
48
+ if len(contents.shape) == 4:
49
+ contents = contents.unsqueeze(1) # B, H, W, C -> B, F, H, W, C
50
+
51
+ contents = contents.permute(0, 4, 1, 2, 3).contiguous() # B, C, F, H, W
52
+ contents = contents.to(vae.device, dtype=vae.dtype)
53
+ contents = contents / 127.5 - 1.0 # normalize to [-1, 1]
54
+
55
+ height, width = contents.shape[3], contents.shape[4]
56
+ if height < 8 or width < 8:
57
+ item = batch[0] # other items should have the same size
58
+ raise ValueError(f"Image or video size too small: {item.item_key} and {len(batch) - 1} more, size: {item.original_size}")
59
+
60
+ # calculate latent frame count from original frame count (4n+1)
61
+ latent_f = (batch[0].frame_count - 1) // 4 + 1
62
+
63
+ # calculate the total number of sections (excluding the first frame, divided by window size)
64
+ total_latent_sections = math.floor((latent_f - 1) / latent_window_size)
65
+ if total_latent_sections < 1:
66
+ min_frames_needed = latent_window_size * 4 + 1
67
+ raise ValueError(
68
+ f"Not enough frames for FramePack: {batch[0].frame_count} frames ({latent_f} latent frames), minimum required: {min_frames_needed} frames ({latent_window_size+1} latent frames)"
69
+ )
70
+
71
+ # actual latent frame count (aligned to section boundaries)
72
+ latent_f_aligned = total_latent_sections * latent_window_size + 1 if not one_frame else 1
73
+
74
+ # actual video frame count
75
+ frame_count_aligned = (latent_f_aligned - 1) * 4 + 1
76
+ if frame_count_aligned != batch[0].frame_count:
77
+ logger.info(
78
+ f"Frame count mismatch: required={frame_count_aligned} != actual={batch[0].frame_count}, trimming to {frame_count_aligned}"
79
+ )
80
+ contents = contents[:, :, :frame_count_aligned, :, :]
81
+
82
+ latent_f = latent_f_aligned # Update to the aligned value
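+ # Worked example (illustrative): frame_count=89, latent_window_size=9
+ #   latent_f = (89 - 1) // 4 + 1 = 23, total_latent_sections = floor((23 - 1) / 9) = 2
+ #   latent_f_aligned = 2 * 9 + 1 = 19, frame_count_aligned = (19 - 1) * 4 + 1 = 73 -> video trimmed to 73 frames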
83
+
84
+ # VAE encode (list of tensor -> stack)
85
+ latents = hunyuan.vae_encode(contents, vae) # include scaling factor
86
+ latents = latents.to("cpu") # (B, C, latent_f, H/8, W/8)
87
+
88
+ # Vision encoding per‑item (once)
89
+ images = np.stack([item.content[0] for item in batch], axis=0) # B, H, W, C
90
+
91
+ # encode image with image encoder
92
+ image_embeddings = []
93
+ with torch.no_grad():
94
+ for image in images:
95
+ image_encoder_output = hf_clip_vision_encode(image, feature_extractor, image_encoder)
96
+ image_embeddings.append(image_encoder_output.last_hidden_state)
97
+ image_embeddings = torch.cat(image_embeddings, dim=0) # B, LEN, 1152
98
+ image_embeddings = image_embeddings.to("cpu") # Save memory
99
+
100
+ if not vanilla_sampling:
101
+ # padding is reversed for inference (future to past)
102
+ latent_paddings = list(reversed(range(total_latent_sections)))
103
+ # Note: The padding trick for inference. See the paper for details.
104
+ if total_latent_sections > 4:
105
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
106
+
107
+ for b, item in enumerate(batch):
108
+ original_latent_cache_path = item.latent_cache_path
109
+ video_lat = latents[b : b + 1] # keep batch dim, 1, C, F, H, W
110
+
111
+ # emulate inference step (history latents)
112
+ # Note: In inference, history_latents stores *generated* future latents.
113
+ # Here, for caching, we just need its shape and type for clean_* tensors.
114
+ # The actual content doesn't matter much as clean_* will be overwritten.
115
+ history_latents = torch.zeros(
116
+ (1, video_lat.shape[1], 1 + 2 + 16, video_lat.shape[3], video_lat.shape[4]), dtype=video_lat.dtype
117
+ ) # C=16 for HY
118
+
119
+ latent_f_index = latent_f - latent_window_size # Start from the last section
120
+ section_index = total_latent_sections - 1
121
+
122
+ for latent_padding in latent_paddings:
123
+ is_last_section = section_index == 0 # the last section in inference order == the first section in time
124
+ latent_padding_size = latent_padding * latent_window_size
125
+ if is_last_section:
126
+ assert latent_f_index == 1, "Last section should be starting from frame 1"
127
+
128
+ # indices generation (same as inference)
129
+ indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
130
+ (
131
+ clean_latent_indices_pre, # Index for start_latent
132
+ blank_indices, # Indices for padding (future context in inference)
133
+ latent_indices, # Indices for the target latents to predict
134
+ clean_latent_indices_post, # Index for the most recent history frame
135
+ clean_latent_2x_indices, # Indices for the next 2 history frames
136
+ clean_latent_4x_indices, # Indices for the next 16 history frames
137
+ ) = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
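+ # Index layout example (illustrative) with latent_window_size=9 and latent_padding=2 (padding size 18):
+ #   [start(1) | blank(18) | target(9) | clean_post(1) | clean_2x(2) | clean_4x(16)] -> 47 positions in total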
138
+
139
+ # Indices for clean_latents (start + recent history)
140
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
141
+
142
+ # clean latents preparation (emulating inference)
143
+ clean_latents_pre = video_lat[:, :, 0:1, :, :] # Always the first frame (start_latent)
144
+ clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, : 1 + 2 + 16, :, :].split(
145
+ [1, 2, 16], dim=2
146
+ )
147
+ clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2) # Combine start frame + placeholder
148
+
149
+ # Target latents for this section (ground truth)
150
+ target_latents = video_lat[:, :, latent_f_index : latent_f_index + latent_window_size, :, :]
151
+
152
+ # save cache (file path is inside item.latent_cache_path pattern), remove batch dim
153
+ item.latent_cache_path = append_section_idx_to_latent_cache_path(original_latent_cache_path, section_index)
154
+ save_latent_cache_framepack(
155
+ item_info=item,
156
+ latent=target_latents.squeeze(0), # Ground truth for this section
157
+ latent_indices=latent_indices.squeeze(0), # Indices for the ground truth section
158
+ clean_latents=clean_latents.squeeze(0), # Start frame + history placeholder
159
+ clean_latent_indices=clean_latent_indices.squeeze(0), # Indices for start frame + history placeholder
160
+ clean_latents_2x=clean_latents_2x.squeeze(0), # History placeholder
161
+ clean_latent_2x_indices=clean_latent_2x_indices.squeeze(0), # Indices for history placeholder
162
+ clean_latents_4x=clean_latents_4x.squeeze(0), # History placeholder
163
+ clean_latent_4x_indices=clean_latent_4x_indices.squeeze(0), # Indices for history placeholder
164
+ image_embeddings=image_embeddings[b],
165
+ )
166
+
167
+ if is_last_section: # If this was the first section generated in inference (time=0)
168
+ # History gets the start frame + the generated first section
169
+ generated_latents_for_history = video_lat[:, :, : latent_window_size + 1, :, :]
170
+ else:
171
+ # History gets the generated current section
172
+ generated_latents_for_history = target_latents # Use true latents as stand-in for generated
173
+
174
+ history_latents = torch.cat([generated_latents_for_history, history_latents], dim=2)
175
+
176
+ section_index -= 1
177
+ latent_f_index -= latent_window_size
178
+
179
+ else:
180
+ # Vanilla Sampling Logic
181
+ for b, item in enumerate(batch):
182
+ original_latent_cache_path = item.latent_cache_path
183
+ video_lat = latents[b : b + 1] # Keep batch dim: 1, C, F_aligned, H, W
184
+ img_emb = image_embeddings[b] # LEN, 1152
185
+
186
+ for section_index in range(total_latent_sections):
187
+ target_start_f = section_index * latent_window_size + 1
188
+ target_end_f = target_start_f + latent_window_size
189
+ target_latents = video_lat[:, :, target_start_f:target_end_f, :, :]
190
+ start_latent = video_lat[:, :, 0:1, :, :]
191
+
192
+ # Clean latents preparation (Vanilla)
193
+ clean_latents_total_count = 1 + 2 + 16
194
+ history_latents = torch.zeros(
195
+ size=(1, 16, clean_latents_total_count, video_lat.shape[-2], video_lat.shape[-1]),
196
+ device=video_lat.device,
197
+ dtype=video_lat.dtype,
198
+ )
199
+
200
+ history_start_f = 0
201
+ video_start_f = target_start_f - clean_latents_total_count
202
+ copy_count = clean_latents_total_count
203
+ if video_start_f < 0:
204
+ history_start_f = -video_start_f
205
+ copy_count = clean_latents_total_count - history_start_f
206
+ video_start_f = 0
207
+ if copy_count > 0:
208
+ history_latents[:, :, history_start_f:] = video_lat[:, :, video_start_f : video_start_f + copy_count, :, :]
209
+
210
+ # indices generation (Vanilla): copy from FramePack-F1
211
+ indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
212
+ (
213
+ clean_latent_indices_start,
214
+ clean_latent_4x_indices,
215
+ clean_latent_2x_indices,
216
+ clean_latent_1x_indices,
217
+ latent_indices,
218
+ ) = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
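+ # Index layout example (illustrative) with latent_window_size=9 (past-to-future order for F1 / vanilla sampling):
+ #   [start(1) | clean_4x(16) | clean_2x(2) | clean_1x(1) | target(9)] -> 29 positions in total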
219
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
220
+
221
+ clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents.split([16, 2, 1], dim=2)
222
+ clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
223
+
224
+ # Save cache
225
+ item.latent_cache_path = append_section_idx_to_latent_cache_path(original_latent_cache_path, section_index)
226
+ save_latent_cache_framepack(
227
+ item_info=item,
228
+ latent=target_latents.squeeze(0),
229
+ latent_indices=latent_indices.squeeze(0), # Indices for target section i
230
+ clean_latents=clean_latents.squeeze(0), # Past clean frames
231
+ clean_latent_indices=clean_latent_indices.squeeze(0), # Indices for clean_latents_pre/post
232
+ clean_latents_2x=clean_latents_2x.squeeze(0), # Past clean frames (2x)
233
+ clean_latent_2x_indices=clean_latent_2x_indices.squeeze(0), # Indices for clean_latents_2x
234
+ clean_latents_4x=clean_latents_4x.squeeze(0), # Past clean frames (4x)
235
+ clean_latent_4x_indices=clean_latent_4x_indices.squeeze(0), # Indices for clean_latents_4x
236
+ image_embeddings=img_emb,
237
+ # Note: We don't explicitly save past_offset_indices,
238
+ # but its size influences the absolute values in other indices.
239
+ )
240
+
241
+
242
+ def encode_and_save_batch_one_frame(
243
+ vae: AutoencoderKLCausal3D,
244
+ feature_extractor: SiglipImageProcessor,
245
+ image_encoder: SiglipVisionModel,
246
+ batch: List[ItemInfo],
247
+ vanilla_sampling: bool = False,
248
+ one_frame_no_2x: bool = False,
249
+ one_frame_no_4x: bool = False,
250
+ ):
251
+ # item.content: target image (H, W, C)
252
+ # item.control_content: list of images (H, W, C)
253
+
254
+ # Stack batch into tensor (B,F,H,W,C) in RGB order. The numbers of control content for each item are the same.
255
+ contents = []
256
+ content_masks: list[list[Optional[torch.Tensor]]] = []
257
+ for item in batch:
258
+ item_contents = item.control_content + [item.content]
259
+
260
+ item_masks = []
261
+ for i, c in enumerate(item_contents):
262
+ if c.shape[-1] == 4: # RGBA
263
+ item_contents[i] = c[..., :3] # remove alpha channel from content
264
+
265
+ alpha = c[..., 3] # extract alpha channel
266
+ mask_image = Image.fromarray(alpha, mode="L")
267
+ width, height = mask_image.size
268
+ mask_image = mask_image.resize((width // 8, height // 8), Image.LANCZOS)
269
+ mask_image = np.array(mask_image) # PIL (mode "L") to numpy, HW
270
+ mask_image = torch.from_numpy(mask_image).float() / 255.0 # 0 to 1.0, HW
271
+ mask_image = mask_image.squeeze(-1) # no-op for HW input; drops a trailing singleton channel if present
272
+ mask_image = mask_image.unsqueeze(0).unsqueeze(0).unsqueeze(0) # HW -> 111HW (BCFHW)
273
+ mask_image = mask_image.to(torch.float32)
274
+ content_mask = mask_image
275
+ else:
276
+ content_mask = None
277
+
278
+ item_masks.append(content_mask)
279
+
280
+ item_contents = [torch.from_numpy(c) for c in item_contents]
281
+ contents.append(torch.stack(item_contents, dim=0)) # list of [F, H, W, C]
282
+ content_masks.append(item_masks)
283
+
284
+ contents = torch.stack(contents, dim=0) # B, F, H, W, C. F is control frames + target frame
285
+
286
+ contents = contents.permute(0, 4, 1, 2, 3).contiguous() # B, C, F, H, W
287
+ contents = contents.to(vae.device, dtype=vae.dtype)
288
+ contents = contents / 127.5 - 1.0 # normalize to [-1, 1]
289
+
290
+ height, width = contents.shape[-2], contents.shape[-1]
291
+ if height < 8 or width < 8:
292
+ item = batch[0] # other items should have the same size
293
+ raise ValueError(f"Image or video size too small: {item.item_key} and {len(batch) - 1} more, size: {item.original_size}")
294
+
295
+ # VAE encode: we need to encode one frame at a time because VAE encoder has stride=4 for the time dimension except for the first frame.
296
+ latents = [hunyuan.vae_encode(contents[:, :, idx : idx + 1], vae).to("cpu") for idx in range(contents.shape[2])]
297
+ latents = torch.cat(latents, dim=2) # B, C, F, H/8, W/8
298
+
299
+ # apply alphas to latents
300
+ for b, item in enumerate(batch):
301
+ for i, content_mask in enumerate(content_masks[b]):
302
+ if content_mask is not None:
303
+ # apply mask to the latents
304
+ # print(f"Applying content mask for item {item.item_key}, frame {i}")
305
+ latents[b : b + 1, :, i : i + 1] *= content_mask
306
+
307
+ # Vision encoding per‑item (once): use control content because it is the start image
308
+ images = [item.control_content[0] for item in batch] # list of [H, W, C]
309
+
310
+ # encode image with image encoder
311
+ image_embeddings = []
312
+ with torch.no_grad():
313
+ for image in images:
314
+ image_encoder_output = hf_clip_vision_encode(image, feature_extractor, image_encoder)
315
+ image_embeddings.append(image_encoder_output.last_hidden_state)
316
+ image_embeddings = torch.cat(image_embeddings, dim=0) # B, LEN, 1152
317
+ image_embeddings = image_embeddings.to("cpu") # Save memory
318
+
319
+ # save cache for each item in the batch
320
+ for b, item in enumerate(batch):
321
+ # indices generation (same as inference): each item may have different clean_latent_indices, so we generate them per item
322
+ clean_latent_indices = item.fp_1f_clean_indices # list of indices for clean latents
323
+ if clean_latent_indices is None or len(clean_latent_indices) == 0:
324
+ logger.warning(
325
+ f"Item {item.item_key} has no clean_latent_indices defined, using default indices for one frame training."
326
+ )
327
+ clean_latent_indices = [0]
328
+
329
+ if not item.fp_1f_no_post:
330
+ clean_latent_indices = clean_latent_indices + [1 + item.fp_latent_window_size]
331
+ clean_latent_indices = torch.Tensor(clean_latent_indices).long() # N
332
+
333
+ latent_index = torch.Tensor([item.fp_1f_target_index]).long() # 1
334
+
335
+ # the zero-valued clean_latents_2x/4x tensors do not need to be cached even if one_frame_no_2x or one_frame_no_4x is False
336
+ clean_latents_2x = None
337
+ clean_latents_4x = None
338
+
339
+ if one_frame_no_2x:
340
+ clean_latent_2x_indices = None
341
+ else:
342
+ index = 1 + item.fp_latent_window_size + 1
343
+ clean_latent_2x_indices = torch.arange(index, index + 2) # 2
344
+
345
+ if one_frame_no_4x:
346
+ clean_latent_4x_indices = None
347
+ else:
348
+ index = 1 + item.fp_latent_window_size + 1 + 2
349
+ clean_latent_4x_indices = torch.arange(index, index + 16) # 16
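+ # Index example (illustrative) with fp_latent_window_size=9, clean_latent_indices=[0] and fp_1f_no_post=False:
+ #   clean_latent_indices=[0, 10], clean_latent_2x_indices=[11, 12], clean_latent_4x_indices=[13..28], latent_index=[fp_1f_target_index]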
350
+
351
+ # clean latents preparation (emulating inference)
352
+ clean_latents = latents[b, :, :-1] # C, F, H, W
353
+ if not item.fp_1f_no_post:
354
+ # If zero post is enabled, we need to add a zero frame at the end
355
+ clean_latents = F.pad(clean_latents, (0, 0, 0, 0, 0, 1), value=0.0) # C, F+1, H, W
356
+
357
+ # Target latents for this section (ground truth)
358
+ target_latents = latents[b, :, -1:] # C, 1, H, W
359
+
360
+ print(f"Saving cache for item {item.item_key} at {item.latent_cache_path}. no_post: {item.fp_1f_no_post}")
361
+ print(f" Clean latent indices: {clean_latent_indices}, latent index: {latent_index}")
362
+ print(f" Clean latents: {clean_latents.shape}, target latents: {target_latents.shape}")
363
+ print(f" Clean latents 2x indices: {clean_latent_2x_indices}, clean latents 4x indices: {clean_latent_4x_indices}")
364
+ print(
365
+ f" Clean latents 2x: {clean_latents_2x.shape if clean_latents_2x is not None else 'None'}, "
366
+ f"Clean latents 4x: {clean_latents_4x.shape if clean_latents_4x is not None else 'None'}"
367
+ )
368
+ print(f" Image embeddings: {image_embeddings[b].shape}")
369
+
370
+ # save cache (file path is inside item.latent_cache_path pattern), remove batch dim
371
+ save_latent_cache_framepack(
372
+ item_info=item,
373
+ latent=target_latents, # Ground truth for this section
374
+ latent_indices=latent_index, # Indices for the ground truth section
375
+ clean_latents=clean_latents, # Start frame + history placeholder
376
+ clean_latent_indices=clean_latent_indices, # Indices for start frame + history placeholder
377
+ clean_latents_2x=clean_latents_2x, # History placeholder
378
+ clean_latent_2x_indices=clean_latent_2x_indices, # Indices for history placeholder
379
+ clean_latents_4x=clean_latents_4x, # History placeholder
380
+ clean_latent_4x_indices=clean_latent_4x_indices, # Indices for history placeholder
381
+ image_embeddings=image_embeddings[b],
382
+ )
383
+
384
+
385
+ def framepack_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
386
+ parser.add_argument("--image_encoder", type=str, required=True, help="Image encoder (CLIP) checkpoint path or directory")
387
+ parser.add_argument(
388
+ "--f1",
389
+ action="store_true",
390
+ help="Generate cache for F1 model (vanilla (autoregressive) sampling) instead of Inverted anti-drifting (plain FramePack)",
391
+ )
392
+ parser.add_argument(
393
+ "--one_frame",
394
+ action="store_true",
395
+ help="Generate cache for one frame training (single frame, single section). latent_window_size is used as the index of the target frame.",
396
+ )
397
+ parser.add_argument(
398
+ "--one_frame_no_2x",
399
+ action="store_true",
400
+ help="Do not use clean_latents_2x and clean_latent_2x_indices for one frame training.",
401
+ )
402
+ parser.add_argument(
403
+ "--one_frame_no_4x",
404
+ action="store_true",
405
+ help="Do not use clean_latents_4x and clean_latent_4x_indices for one frame training.",
406
+ )
407
+ return parser
408
+
409
+
410
+ def main(args: argparse.Namespace):
411
+ device = args.device if hasattr(args, "device") and args.device else ("cuda" if torch.cuda.is_available() else "cpu")
412
+ device = torch.device(device)
413
+
414
+ # Load dataset config
415
+ blueprint_generator = BlueprintGenerator(ConfigSanitizer())
416
+ logger.info(f"Load dataset config from {args.dataset_config}")
417
+ user_config = config_utils.load_user_config(args.dataset_config)
418
+ blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_FRAMEPACK)
419
+ train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)
420
+
421
+ datasets = train_dataset_group.datasets
422
+
423
+ if args.debug_mode is not None:
424
+ cache_latents.show_datasets(
425
+ datasets, args.debug_mode, args.console_width, args.console_back, args.console_num_images, fps=16
426
+ )
427
+ return
428
+
429
+ assert args.vae is not None, "vae checkpoint is required"
430
+
431
+ logger.info(f"Loading VAE model from {args.vae}")
432
+ vae = load_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, device=device)
433
+ vae.to(device)
434
+
435
+ logger.info(f"Loading image encoder from {args.image_encoder}")
436
+ feature_extractor, image_encoder = load_image_encoders(args)
437
+ image_encoder.eval()
438
+ image_encoder.to(device)
439
+
440
+ logger.info(f"Cache generation mode: {'Vanilla Sampling' if args.f1 else 'Inference Emulation'}")
441
+
442
+ # encoding closure
443
+ def encode(batch: List[ItemInfo]):
444
+ encode_and_save_batch(
445
+ vae, feature_extractor, image_encoder, batch, args.f1, args.one_frame, args.one_frame_no_2x, args.one_frame_no_4x
446
+ )
447
+
448
+ # reuse core loop from cache_latents with no change
449
+ encode_datasets_framepack(datasets, encode, args)
450
+
451
+
452
+ def append_section_idx_to_latent_cache_path(latent_cache_path: str, section_idx: int) -> str:
453
+ tokens = latent_cache_path.split("_")
454
+ tokens[-3] = f"{tokens[-3]}-{section_idx:04d}" # append section index to "frame_pos-count"
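+ # Illustrative example (basename and architecture tokens depend on the dataset):
+ #   "clip_00000-0089_0512x0768_framepack.safetensors", section_idx=2 -> "clip_00000-0089-0002_0512x0768_framepack.safetensors"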
455
+ return "_".join(tokens)
456
+
457
+
458
+ def encode_datasets_framepack(datasets: list[BaseDataset], encode: callable, args: argparse.Namespace):
459
+ num_workers = args.num_workers if args.num_workers is not None else max(1, os.cpu_count() - 1)
460
+ for i, dataset in enumerate(datasets):
461
+ logger.info(f"Encoding dataset [{i}]")
462
+ all_latent_cache_paths = []
463
+ for _, batch in tqdm(dataset.retrieve_latent_cache_batches(num_workers)):
464
+ batch: list[ItemInfo] = batch # type: ignore
465
+
466
+ # latent_cache_path is "{basename}_{w:04d}x{h:04d}_{self.architecture}.safetensors"
467
+ # For video datasets, we expand it to "{basename}_{section_idx:04d}_{w:04d}x{h:04d}_{self.architecture}.safetensors"
468
+ filtered_batch = []
469
+ for item in batch:
470
+ if item.frame_count is None:
471
+ # image dataset
472
+ all_latent_cache_paths.append(item.latent_cache_path)
473
+ all_existing = os.path.exists(item.latent_cache_path)
474
+ else:
475
+ latent_f = (item.frame_count - 1) // 4 + 1
476
+ num_sections = max(1, math.floor((latent_f - 1) / item.fp_latent_window_size)) # min 1 section
477
+ all_existing = True
478
+ for sec in range(num_sections):
479
+ p = append_section_idx_to_latent_cache_path(item.latent_cache_path, sec)
480
+ all_latent_cache_paths.append(p)
481
+ all_existing = all_existing and os.path.exists(p)
482
+
483
+ if not all_existing: # if any section cache is missing
484
+ filtered_batch.append(item)
485
+
486
+ if args.skip_existing:
487
+ if len(filtered_batch) == 0: # all sections exist
488
+ logger.info(f"All sections exist for {batch[0].item_key}, skipping")
489
+ continue
490
+ batch = filtered_batch # update batch to only missing sections
491
+
492
+ bs = args.batch_size if args.batch_size is not None else len(batch)
493
+ for i in range(0, len(batch), bs):
494
+ encode(batch[i : i + bs])
495
+
496
+ # normalize paths
497
+ all_latent_cache_paths = [os.path.normpath(p) for p in all_latent_cache_paths]
498
+ all_latent_cache_paths = set(all_latent_cache_paths)
499
+
500
+ # remove old cache files not in the dataset
501
+ all_cache_files = dataset.get_all_latent_cache_files()
502
+ for cache_file in all_cache_files:
503
+ if os.path.normpath(cache_file) not in all_latent_cache_paths:
504
+ if args.keep_cache:
505
+ logger.info(f"Keep cache file not in the dataset: {cache_file}")
506
+ else:
507
+ os.remove(cache_file)
508
+ logger.info(f"Removed old cache file: {cache_file}")
509
+
510
+
511
+ if __name__ == "__main__":
512
+ parser = cache_latents.setup_parser_common()
513
+ parser = cache_latents.hv_setup_parser(parser) # VAE
514
+ parser = framepack_setup_parser(parser)
515
+
516
+ args = parser.parse_args()
517
+
518
+ if args.vae_dtype is not None:
519
+ raise ValueError("VAE dtype is not supported in FramePack")
520
+ # if args.batch_size != 1:
521
+ # args.batch_size = 1
522
+ # logger.info("Batch size is set to 1 for FramePack.")
523
+
524
+ main(args)
fpack_cache_text_encoder_outputs.py ADDED
@@ -0,0 +1,110 @@
1
+ import argparse
2
+ import os
3
+ from typing import Optional, Union
4
+
5
+ import numpy as np
6
+ import torch
7
+ from tqdm import tqdm
8
+ from transformers import LlamaTokenizerFast, LlamaModel, CLIPTokenizer, CLIPTextModel
9
+ from dataset import config_utils
10
+ from dataset.config_utils import BlueprintGenerator, ConfigSanitizer
11
+ from dataset.image_video_dataset import ARCHITECTURE_FRAMEPACK, ItemInfo, save_text_encoder_output_cache_framepack
12
+ import cache_text_encoder_outputs
13
+ from frame_pack import hunyuan
14
+ from frame_pack.framepack_utils import load_text_encoder1, load_text_encoder2
15
+
16
+ import logging
17
+
18
+ from frame_pack.utils import crop_or_pad_yield_mask
19
+
20
+ logger = logging.getLogger(__name__)
21
+ logging.basicConfig(level=logging.INFO)
22
+
23
+
24
+ def encode_and_save_batch(
25
+ tokenizer1: LlamaTokenizerFast,
26
+ text_encoder1: LlamaModel,
27
+ tokenizer2: CLIPTokenizer,
28
+ text_encoder2: CLIPTextModel,
29
+ batch: list[ItemInfo],
30
+ device: torch.device,
31
+ ):
32
+ prompts = [item.caption for item in batch]
33
+
34
+ # encode prompt
35
+ # FramePack's encode_prompt_conds only supports single prompt, so we need to encode each prompt separately
36
+ list_of_llama_vec = []
37
+ list_of_llama_attention_mask = []
38
+ list_of_clip_l_pooler = []
39
+ for prompt in prompts:
40
+ with torch.autocast(device_type=device.type, dtype=text_encoder1.dtype), torch.no_grad():
41
+ # llama_vec, clip_l_pooler = hunyuan.encode_prompt_conds(prompts, text_encoder1, text_encoder2, tokenizer1, tokenizer2)
42
+ llama_vec, clip_l_pooler = hunyuan.encode_prompt_conds(prompt, text_encoder1, text_encoder2, tokenizer1, tokenizer2)
43
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
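+ # crop_or_pad_yield_mask fixes the sequence length to 512 tokens and returns the matching attention
+ # mask, so every cached text encoder output has the same shape regardless of prompt length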
44
+
45
+ list_of_llama_vec.append(llama_vec.squeeze(0))
46
+ list_of_llama_attention_mask.append(llama_attention_mask.squeeze(0))
47
+ list_of_clip_l_pooler.append(clip_l_pooler.squeeze(0))
48
+
49
+ # save prompt cache
50
+ for item, llama_vec, llama_attention_mask, clip_l_pooler in zip(
51
+ batch, list_of_llama_vec, list_of_llama_attention_mask, list_of_clip_l_pooler
52
+ ):
53
+ # save llama_vec and clip_l_pooler to cache
54
+ save_text_encoder_output_cache_framepack(item, llama_vec, llama_attention_mask, clip_l_pooler)
55
+
56
+
57
+ def main(args):
58
+ device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
59
+ device = torch.device(device)
60
+
61
+ # Load dataset config
62
+ blueprint_generator = BlueprintGenerator(ConfigSanitizer())
63
+ logger.info(f"Load dataset config from {args.dataset_config}")
64
+ user_config = config_utils.load_user_config(args.dataset_config)
65
+ blueprint = blueprint_generator.generate(user_config, args, architecture=ARCHITECTURE_FRAMEPACK)
66
+ train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group)
67
+
68
+ datasets = train_dataset_group.datasets
69
+
70
+ # prepare cache files and paths: all_cache_files_for_dataset = existing cache files, all_cache_paths_for_dataset = all cache paths in the dataset
71
+ all_cache_files_for_dataset, all_cache_paths_for_dataset = cache_text_encoder_outputs.prepare_cache_files_and_paths(datasets)
72
+
73
+ # load text encoder
74
+ tokenizer1, text_encoder1 = load_text_encoder1(args, args.fp8_llm, device)
75
+ tokenizer2, text_encoder2 = load_text_encoder2(args)
76
+ text_encoder2.to(device)
77
+
78
+ # Encode with Text Encoders
79
+ logger.info("Encoding with Text Encoders")
80
+
81
+ def encode_for_text_encoder(batch: list[ItemInfo]):
82
+ encode_and_save_batch(tokenizer1, text_encoder1, tokenizer2, text_encoder2, batch, device)
83
+
84
+ cache_text_encoder_outputs.process_text_encoder_batches(
85
+ args.num_workers,
86
+ args.skip_existing,
87
+ args.batch_size,
88
+ datasets,
89
+ all_cache_files_for_dataset,
90
+ all_cache_paths_for_dataset,
91
+ encode_for_text_encoder,
92
+ )
93
+
94
+ # remove cache files not in dataset
95
+ cache_text_encoder_outputs.post_process_cache_files(datasets, all_cache_files_for_dataset, all_cache_paths_for_dataset, args.keep_cache)
96
+
97
+
98
+ def framepack_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
99
+ parser.add_argument("--text_encoder1", type=str, required=True, help="Text Encoder 1 directory")
100
+ parser.add_argument("--text_encoder2", type=str, required=True, help="Text Encoder 2 directory")
101
+ parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for Text Encoder 1 (LLM)")
102
+ return parser
103
+
104
+
105
+ if __name__ == "__main__":
106
+ parser = cache_text_encoder_outputs.setup_parser_common()
107
+ parser = framepack_setup_parser(parser)
108
+
109
+ args = parser.parse_args()
110
+ main(args)
fpack_generate_video.py ADDED
@@ -0,0 +1,1832 @@
1
+ import argparse
2
+ from datetime import datetime
3
+ import gc
4
+ import json
5
+ import random
6
+ import os
7
+ import re
8
+ import time
9
+ import math
10
+ import copy
11
+ from typing import Tuple, Optional, List, Union, Any, Dict
12
+
13
+ import torch
14
+ from safetensors.torch import load_file, save_file
15
+ from safetensors import safe_open
16
+ from PIL import Image
17
+ import cv2
18
+ import numpy as np
19
+ import torchvision.transforms.functional as TF
20
+ from transformers import LlamaModel
21
+ from tqdm import tqdm
22
+
23
+ from networks import lora_framepack
24
+ from hunyuan_model.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
25
+ from frame_pack import hunyuan
26
+ from frame_pack.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked, load_packed_model
27
+ from frame_pack.utils import crop_or_pad_yield_mask, resize_and_center_crop, soft_append_bcthw
28
+ from frame_pack.bucket_tools import find_nearest_bucket
29
+ from frame_pack.clip_vision import hf_clip_vision_encode
30
+ from frame_pack.k_diffusion_hunyuan import sample_hunyuan
31
+ from dataset import image_video_dataset
32
+
33
+ try:
34
+ from lycoris.kohya import create_network_from_weights
35
+ except:
36
+ pass
37
+
38
+ from utils.device_utils import clean_memory_on_device
39
+ from hv_generate_video import save_images_grid, save_videos_grid, synchronize_device
40
+ from wan_generate_video import merge_lora_weights
41
+ from frame_pack.framepack_utils import load_vae, load_text_encoder1, load_text_encoder2, load_image_encoders
42
+ from dataset.image_video_dataset import load_video
43
+
44
+ import logging
45
+
46
+ logger = logging.getLogger(__name__)
47
+ logging.basicConfig(level=logging.INFO)
48
+
49
+
50
+ class GenerationSettings:
51
+ def __init__(self, device: torch.device, dit_weight_dtype: Optional[torch.dtype] = None):
52
+ self.device = device
53
+ self.dit_weight_dtype = dit_weight_dtype # not used currently because model may be optimized
54
+
55
+
56
+ def parse_args() -> argparse.Namespace:
57
+ """parse command line arguments"""
58
+ parser = argparse.ArgumentParser(description="FramePack inference script")
59
+
60
+ # WAN arguments
61
+ # parser.add_argument("--ckpt_dir", type=str, default=None, help="The path to the checkpoint directory (Wan 2.1 official).")
62
+ parser.add_argument(
63
+ "--sample_solver", type=str, default="unipc", choices=["unipc", "dpm++", "vanilla"], help="The solver used to sample."
64
+ )
65
+
66
+ parser.add_argument("--dit", type=str, default=None, help="DiT directory or path")
67
+ parser.add_argument("--vae", type=str, default=None, help="VAE directory or path")
68
+ parser.add_argument("--text_encoder1", type=str, required=True, help="Text Encoder 1 directory or path")
69
+ parser.add_argument("--text_encoder2", type=str, required=True, help="Text Encoder 2 directory or path")
70
+ parser.add_argument("--image_encoder", type=str, required=True, help="Image Encoder directory or path")
71
+ parser.add_argument("--f1", action="store_true", help="Use F1 sampling method")
72
+
73
+ # LoRA
74
+ parser.add_argument("--lora_weight", type=str, nargs="*", required=False, default=None, help="LoRA weight path")
75
+ parser.add_argument("--lora_multiplier", type=float, nargs="*", default=1.0, help="LoRA multiplier")
76
+ parser.add_argument("--include_patterns", type=str, nargs="*", default=None, help="LoRA module include patterns")
77
+ parser.add_argument("--exclude_patterns", type=str, nargs="*", default=None, help="LoRA module exclude patterns")
78
+ parser.add_argument(
79
+ "--save_merged_model",
80
+ type=str,
81
+ default=None,
82
+ help="Save merged model to path. If specified, no inference will be performed.",
83
+ )
84
+
85
+ # inference
86
+ parser.add_argument(
87
+ "--prompt",
88
+ type=str,
89
+ default=None,
90
+ help="prompt for generation. If `;;;` is used, it will be split into sections. Example: `section_index:prompt` or "
91
+ "`section_index:prompt;;;section_index:prompt;;;...`, section_index can be `0` or `-1` or `0-2`, `-1` means last section, `0-2` means from 0 to 2 (inclusive).",
92
+ )
93
+ parser.add_argument(
94
+ "--negative_prompt",
95
+ type=str,
96
+ default=None,
97
+ help="negative prompt for generation, default is empty string. should not change.",
98
+ )
99
+ parser.add_argument(
100
+ "--custom_system_prompt",
101
+ type=str,
102
+ default=None,
103
+ help="Custom system prompt for LLM. If specified, it will override the default system prompt. See hunyuan_model/text_encoder.py for the default system prompt.",
104
+ )
105
+ parser.add_argument("--video_size", type=int, nargs=2, default=[256, 256], help="video size, height and width")
106
+ parser.add_argument("--video_seconds", type=float, default=5.0, help="video length, default is 5.0 seconds")
107
+ parser.add_argument(
108
+ "--video_sections",
109
+ type=int,
110
+ default=None,
111
+ help="number of video sections, Default is None (auto calculate from video seconds)",
112
+ )
113
+ parser.add_argument(
114
+ "--one_frame_inference",
115
+ type=str,
116
+ default=None,
117
+ help="one frame inference, default is None, comma separated values from 'no_2x', 'no_4x', 'no_post', 'control_indices' and 'target_index'.",
118
+ )
119
+ parser.add_argument(
120
+ "--control_image_path", type=str, default=None, nargs="*", help="path to control (reference) image for one frame inference."
121
+ )
122
+ parser.add_argument(
123
+ "--control_image_mask_path",
124
+ type=str,
125
+ default=None,
126
+ nargs="*",
127
+ help="path to control (reference) image mask for one frame inference.",
128
+ )
129
+ parser.add_argument("--fps", type=int, default=30, help="video fps, default is 30")
130
+ parser.add_argument("--infer_steps", type=int, default=25, help="number of inference steps, default is 25")
131
+ parser.add_argument("--save_path", type=str, required=True, help="path to save generated video")
132
+ parser.add_argument("--seed", type=int, default=None, help="Seed for evaluation.")
133
+ # parser.add_argument(
134
+ # "--cpu_noise", action="store_true", help="Use CPU to generate noise (compatible with ComfyUI). Default is False."
135
+ # )
136
+ parser.add_argument("--latent_window_size", type=int, default=9, help="latent window size, default is 9. should not change.")
137
+ parser.add_argument(
138
+ "--embedded_cfg_scale", type=float, default=10.0, help="Embeded CFG scale (distilled CFG Scale), default is 10.0"
139
+ )
140
+ parser.add_argument(
141
+ "--guidance_scale",
142
+ type=float,
143
+ default=1.0,
144
+ help="Guidance scale for classifier free guidance. Default is 1.0 (no guidance), should not change.",
145
+ )
146
+ parser.add_argument("--guidance_rescale", type=float, default=0.0, help="CFG Re-scale, default is 0.0. Should not change.")
147
+ # parser.add_argument("--video_path", type=str, default=None, help="path to video for video2video inference")
148
+ parser.add_argument(
149
+ "--image_path",
150
+ type=str,
151
+ default=None,
152
+ help="path to image for image2video inference. If `;;;` is used, it will be used as section images. The notation is same as `--prompt`.",
153
+ )
154
+ parser.add_argument("--end_image_path", type=str, default=None, help="path to end image for image2video inference")
155
+ parser.add_argument(
156
+ "--latent_paddings",
157
+ type=str,
158
+ default=None,
159
+ help="latent paddings for each section, comma separated values. default is None (FramePack default paddings)",
160
+ )
161
+ # parser.add_argument(
162
+ # "--control_path",
163
+ # type=str,
164
+ # default=None,
165
+ # help="path to control video for inference with controlnet. video file or directory with images",
166
+ # )
167
+ # parser.add_argument("--trim_tail_frames", type=int, default=0, help="trim tail N frames from the video before saving")
168
+
169
+ # # Flow Matching
170
+ # parser.add_argument(
171
+ # "--flow_shift",
172
+ # type=float,
173
+ # default=None,
174
+ # help="Shift factor for flow matching schedulers. Default depends on task.",
175
+ # )
176
+
177
+ parser.add_argument("--fp8", action="store_true", help="use fp8 for DiT model")
178
+ parser.add_argument("--fp8_scaled", action="store_true", help="use scaled fp8 for DiT, only for fp8")
179
+ # parser.add_argument("--fp8_fast", action="store_true", help="Enable fast FP8 arithmetic (RTX 4XXX+), only for fp8_scaled")
180
+ parser.add_argument(
181
+ "--rope_scaling_factor", type=float, default=0.5, help="RoPE scaling factor for high resolution (H/W), default is 0.5"
182
+ )
183
+ parser.add_argument(
184
+ "--rope_scaling_timestep_threshold",
185
+ type=int,
186
+ default=None,
187
+ help="RoPE scaling timestep threshold, default is None (disable), if set, RoPE scaling will be applied only for timesteps >= threshold, around 800 is good starting point",
188
+ )
189
+
190
+ parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for Text Encoder 1 (LLM)")
191
+ parser.add_argument(
192
+ "--device", type=str, default=None, help="device to use for inference. If None, use CUDA if available, otherwise use CPU"
193
+ )
194
+ parser.add_argument(
195
+ "--attn_mode",
196
+ type=str,
197
+ default="torch",
198
+ choices=["flash", "torch", "sageattn", "xformers", "sdpa"], # "flash2", "flash3",
199
+ help="attention mode",
200
+ )
201
+ parser.add_argument("--vae_chunk_size", type=int, default=None, help="chunk size for CausalConv3d in VAE")
202
+ parser.add_argument(
203
+ "--vae_spatial_tile_sample_min_size", type=int, default=None, help="spatial tile sample min size for VAE, default 256"
204
+ )
205
+ parser.add_argument("--bulk_decode", action="store_true", help="decode all frames at once")
206
+ parser.add_argument("--blocks_to_swap", type=int, default=0, help="number of blocks to swap in the model")
207
+ parser.add_argument(
208
+ "--output_type",
209
+ type=str,
210
+ default="video",
211
+ choices=["video", "images", "latent", "both", "latent_images"],
212
+ help="output type",
213
+ )
214
+ parser.add_argument("--no_metadata", action="store_true", help="do not save metadata")
215
+ parser.add_argument("--latent_path", type=str, nargs="*", default=None, help="path to latent for decode. no inference")
216
+ parser.add_argument("--lycoris", action="store_true", help="use lycoris for inference")
217
+ # parser.add_argument("--compile", action="store_true", help="Enable torch.compile")
218
+ # parser.add_argument(
219
+ # "--compile_args",
220
+ # nargs=4,
221
+ # metavar=("BACKEND", "MODE", "DYNAMIC", "FULLGRAPH"),
222
+ # default=["inductor", "max-autotune-no-cudagraphs", "False", "False"],
223
+ # help="Torch.compile settings",
224
+ # )
225
+
226
+ # New arguments for batch and interactive modes
227
+ parser.add_argument("--from_file", type=str, default=None, help="Read prompts from a file")
228
+ parser.add_argument("--interactive", action="store_true", help="Interactive mode: read prompts from console")
229
+
230
+ args = parser.parse_args()
231
+
232
+ # Validate arguments
233
+ if args.from_file and args.interactive:
234
+ raise ValueError("Cannot use both --from_file and --interactive at the same time")
235
+
236
+ if args.latent_path is None or len(args.latent_path) == 0:
237
+ if args.prompt is None and not args.from_file and not args.interactive:
238
+ raise ValueError("Either --prompt, --from_file or --interactive must be specified")
239
+
240
+ return args
241
+
242
+
243
+ def parse_prompt_line(line: str) -> Dict[str, Any]:
244
+ """Parse a prompt line into a dictionary of argument overrides
245
+
246
+ Args:
247
+ line: Prompt line with options
248
+
249
+ Returns:
250
+ Dict[str, Any]: Dictionary of argument overrides
251
+ """
252
+ # TODO common function with hv_train_network.line_to_prompt_dict
253
+ parts = line.split(" --")
254
+ prompt = parts[0].strip()
255
+
256
+ # Create dictionary of overrides
257
+ overrides = {"prompt": prompt}
258
+ # Initialize control_image_path and control_image_mask_path as a list to accommodate multiple paths
259
+ overrides["control_image_path"] = []
260
+ overrides["control_image_mask_path"] = []
261
+
262
+ for part in parts[1:]:
263
+ if not part.strip():
264
+ continue
265
+ option_parts = part.split(" ", 1)
266
+ option = option_parts[0].strip()
267
+ value = option_parts[1].strip() if len(option_parts) > 1 else ""
268
+
269
+ # Map options to argument names
270
+ if option == "w":
271
+ overrides["video_size_width"] = int(value)
272
+ elif option == "h":
273
+ overrides["video_size_height"] = int(value)
274
+ elif option == "f":
275
+ overrides["video_seconds"] = float(value)
276
+ elif option == "d":
277
+ overrides["seed"] = int(value)
278
+ elif option == "s":
279
+ overrides["infer_steps"] = int(value)
280
+ elif option == "g" or option == "l":
281
+ overrides["guidance_scale"] = float(value)
282
+ # elif option == "fs":
283
+ # overrides["flow_shift"] = float(value)
284
+ elif option == "i":
285
+ overrides["image_path"] = value
286
+ # elif option == "im":
287
+ # overrides["image_mask_path"] = value
288
+ # elif option == "cn":
289
+ # overrides["control_path"] = value
290
+ elif option == "n":
291
+ overrides["negative_prompt"] = value
292
+ elif option == "vs": # video_sections
293
+ overrides["video_sections"] = int(value)
294
+ elif option == "ei": # end_image_path
295
+ overrides["end_image_path"] = value
296
+ elif option == "ci": # control_image_path
297
+ overrides["control_image_path"].append(value)
298
+ elif option == "cim": # control_image_mask_path
299
+ overrides["control_image_mask_path"].append(value)
300
+ elif option == "of": # one_frame_inference
301
+ overrides["one_frame_inference"] = value
302
+
303
+ # If no control_image_path was provided, remove the empty list
304
+ if not overrides["control_image_path"]:
305
+ del overrides["control_image_path"]
306
+ if not overrides["control_image_mask_path"]:
307
+ del overrides["control_image_mask_path"]
308
+
309
+ return overrides
310
+
311
+
312
+ def apply_overrides(args: argparse.Namespace, overrides: Dict[str, Any]) -> argparse.Namespace:
313
+ """Apply overrides to args
314
+
315
+ Args:
316
+ args: Original arguments
317
+ overrides: Dictionary of overrides
318
+
319
+ Returns:
320
+ argparse.Namespace: New arguments with overrides applied
321
+ """
322
+ args_copy = copy.deepcopy(args)
323
+
324
+ for key, value in overrides.items():
325
+ if key == "video_size_width":
326
+ args_copy.video_size[1] = value
327
+ elif key == "video_size_height":
328
+ args_copy.video_size[0] = value
329
+ else:
330
+ setattr(args_copy, key, value)
331
+
332
+ return args_copy
333
+
334
+
335
+ def check_inputs(args: argparse.Namespace) -> Tuple[int, int, float]:
336
+ """Validate video size and length
337
+
338
+ Args:
339
+ args: command line arguments
340
+
341
+ Returns:
342
+ Tuple[int, int, float]: (height, width, video_seconds)
343
+ """
344
+ height = args.video_size[0]
345
+ width = args.video_size[1]
346
+
347
+ video_seconds = args.video_seconds
348
+ if args.video_sections is not None:
349
+ video_seconds = (args.video_sections * (args.latent_window_size * 4) + 1) / args.fps
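+ # e.g. (illustrative) video_sections=3, latent_window_size=9, fps=30 -> (3 * 36 + 1) / 30 ≈ 3.63 seconds (109 frames)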
350
+
351
+ if height % 8 != 0 or width % 8 != 0:
352
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
353
+
354
+ return height, width, video_seconds
355
+
356
+
357
+ # region DiT model
358
+
359
+
360
+ def load_dit_model(args: argparse.Namespace, device: torch.device) -> HunyuanVideoTransformer3DModelPacked:
361
+ """load DiT model
362
+
363
+ Args:
364
+ args: command line arguments
365
+ device: device to use
366
+ dit_dtype: data type for the model
367
+ dit_weight_dtype: data type for the model weights. None for as-is
368
+
369
+ Returns:
370
+ HunyuanVideoTransformer3DModelPacked: DiT model
371
+ """
372
+ loading_device = "cpu"
373
+ if args.blocks_to_swap == 0 and not args.fp8_scaled and args.lora_weight is None:
374
+ loading_device = device
375
+
376
+ # do not fp8 optimize because we will merge LoRA weights
377
+ model = load_packed_model(device, args.dit, args.attn_mode, loading_device)
378
+
379
+ # apply RoPE scaling factor
380
+ if args.rope_scaling_timestep_threshold is not None:
381
+ logger.info(
382
+ f"Applying RoPE scaling factor {args.rope_scaling_factor} for timesteps >= {args.rope_scaling_timestep_threshold}"
383
+ )
384
+ model.enable_rope_scaling(args.rope_scaling_timestep_threshold, args.rope_scaling_factor)
385
+ return model
386
+
387
+
388
+ def optimize_model(model: HunyuanVideoTransformer3DModelPacked, args: argparse.Namespace, device: torch.device) -> None:
389
+ """optimize the model (FP8 conversion, device move etc.)
390
+
391
+ Args:
392
+ model: dit model
393
+ args: command line arguments
394
+ device: device to use
395
+ """
396
+ if args.fp8_scaled:
397
+ # load state dict as-is and optimize to fp8
398
+ state_dict = model.state_dict()
399
+
400
+ # if no blocks to swap, we can move the weights to GPU after optimization on GPU (omit redundant CPU->GPU copy)
401
+ move_to_device = args.blocks_to_swap == 0 # if blocks_to_swap > 0, we will keep the model on CPU
402
+ state_dict = model.fp8_optimization(state_dict, device, move_to_device, use_scaled_mm=False) # args.fp8_fast)
403
+
404
+ info = model.load_state_dict(state_dict, strict=True, assign=True)
405
+ logger.info(f"Loaded FP8 optimized weights: {info}")
406
+
407
+ if args.blocks_to_swap == 0:
408
+ model.to(device) # make sure all parameters are on the right device (e.g. RoPE etc.)
409
+ else:
410
+ # simple cast to dit_dtype
411
+ target_dtype = None # load as-is (dit_weight_dtype == dtype of the weights in state_dict)
412
+ target_device = None
413
+
414
+ if args.fp8:
415
+ target_dtype = torch.float8_e4m3fn
416
+
417
+ if args.blocks_to_swap == 0:
418
+ logger.info(f"Move model to device: {device}")
419
+ target_device = device
420
+
421
+ if target_device is not None and target_dtype is not None:
422
+ model.to(target_device, target_dtype) # move and cast at the same time. this reduces redundant copy operations
423
+
424
+ # if args.compile:
425
+ # compile_backend, compile_mode, compile_dynamic, compile_fullgraph = args.compile_args
426
+ # logger.info(
427
+ # f"Torch Compiling[Backend: {compile_backend}; Mode: {compile_mode}; Dynamic: {compile_dynamic}; Fullgraph: {compile_fullgraph}]"
428
+ # )
429
+ # torch._dynamo.config.cache_size_limit = 32
430
+ # for i in range(len(model.blocks)):
431
+ # model.blocks[i] = torch.compile(
432
+ # model.blocks[i],
433
+ # backend=compile_backend,
434
+ # mode=compile_mode,
435
+ # dynamic=compile_dynamic.lower() in "true",
436
+ # fullgraph=compile_fullgraph.lower() in "true",
437
+ # )
438
+
439
+ if args.blocks_to_swap > 0:
440
+ logger.info(f"Enable swap {args.blocks_to_swap} blocks to CPU from device: {device}")
441
+ model.enable_block_swap(args.blocks_to_swap, device, supports_backward=False)
442
+ model.move_to_device_except_swap_blocks(device)
443
+ model.prepare_block_swap_before_forward()
444
+ else:
445
+ # make sure the model is on the right device
446
+ model.to(device)
447
+
448
+ model.eval().requires_grad_(False)
449
+ clean_memory_on_device(device)
450
+
451
+
452
+ # endregion
453
+
454
+
455
+ def decode_latent(
456
+ latent_window_size: int,
457
+ total_latent_sections: int,
458
+ bulk_decode: bool,
459
+ vae: AutoencoderKLCausal3D,
460
+ latent: torch.Tensor,
461
+ device: torch.device,
462
+ one_frame_inference_mode: bool = False,
463
+ ) -> torch.Tensor:
464
+ logger.info(f"Decoding video...")
465
+ if latent.ndim == 4:
466
+ latent = latent.unsqueeze(0) # add batch dimension
467
+
468
+ vae.to(device)
469
+ if not bulk_decode and not one_frame_inference_mode:
470
+ latent_window_size = latent_window_size # default is 9
471
+ # total_latent_sections = (args.video_seconds * 30) / (latent_window_size * 4)
472
+ # total_latent_sections = int(max(round(total_latent_sections), 1))
473
+ num_frames = latent_window_size * 4 - 3
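+ # e.g. (illustrative) latent_window_size=9 -> num_frames=33, i.e. 9 new latent frames per section;
+ # decoded sections overlap by the same 33 pixel frames and are blended with soft_append_bcthw below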
474
+
475
+ latents_to_decode = []
476
+ latent_frame_index = 0
477
+ for i in range(total_latent_sections - 1, -1, -1):
478
+ is_last_section = i == total_latent_sections - 1
479
+ generated_latent_frames = (num_frames + 3) // 4 + (1 if is_last_section else 0)
480
+ section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
481
+
482
+ section_latent = latent[:, :, latent_frame_index : latent_frame_index + section_latent_frames, :, :]
483
+ if section_latent.shape[2] > 0:
484
+ latents_to_decode.append(section_latent)
485
+
486
+ latent_frame_index += generated_latent_frames
487
+
488
+ latents_to_decode = latents_to_decode[::-1] # reverse the order of latents to decode
489
+
490
+ history_pixels = None
491
+ for latent in tqdm(latents_to_decode):
492
+ if history_pixels is None:
493
+ history_pixels = hunyuan.vae_decode(latent, vae).cpu()
494
+ else:
495
+ overlapped_frames = latent_window_size * 4 - 3
496
+ current_pixels = hunyuan.vae_decode(latent, vae).cpu()
497
+ history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
498
+ clean_memory_on_device(device)
499
+ else:
500
+ # bulk decode
501
+ logger.info(f"Bulk decoding or one frame inference")
502
+ if not one_frame_inference_mode:
503
+ history_pixels = hunyuan.vae_decode(latent, vae).cpu() # normal
504
+ else:
505
+ # one frame inference
506
+ history_pixels = [hunyuan.vae_decode(latent[:, :, i : i + 1, :, :], vae).cpu() for i in range(latent.shape[2])]
507
+ history_pixels = torch.cat(history_pixels, dim=2)
508
+
509
+ vae.to("cpu")
510
+
511
+ logger.info(f"Decoded. Pixel shape {history_pixels.shape}")
512
+ return history_pixels[0] # remove batch dimension
513
+
514
+
515
+ def prepare_i2v_inputs(
516
+ args: argparse.Namespace,
517
+ device: torch.device,
518
+ vae: AutoencoderKLCausal3D,
519
+ shared_models: Optional[Dict] = None,
520
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[dict, dict]]:
521
+ """Prepare inputs for I2V
522
+
523
+ Args:
524
+ args: command line arguments
525
+ config: model configuration
526
+ device: device to use
527
+ vae: VAE model, used for image encoding
528
+ shared_models: dictionary containing pre-loaded models
529
+
530
+ Returns:
531
+ Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[dict, dict]]:
532
+ (noise, context, context_null, y, (arg_c, arg_null))
533
+ """
534
+
535
+ height, width, video_seconds = check_inputs(args)
536
+
537
+ # define parsing function
538
+ def parse_section_strings(input_string: str) -> dict[int, str]:
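+ # Parsing example (illustrative): "0:a cat;;;1-2:a dog;;;-1:a bird"
+ #   -> {0: "a cat", 1: "a dog", 2: "a dog", -1: "a bird"}; a plain string without ";;;" becomes {0: input_string}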
539
+ section_strings = {}
540
+ if ";;;" in input_string:
541
+ split_section_strings = input_string.split(";;;")
542
+ for section_str in split_section_strings:
543
+ if ":" not in section_str:
544
+ start = end = 0
545
+ section_str = section_str.strip()
546
+ else:
547
+ index_str, section_str = section_str.split(":", 1)
548
+ index_str = index_str.strip()
549
+ section_str = section_str.strip()
550
+
551
+ m = re.match(r"^(-?\d+)(-\d+)?$", index_str)
552
+ if m:
553
+ start = int(m.group(1))
554
+ end = int(m.group(2)[1:]) if m.group(2) is not None else start
555
+ else:
556
+ start = end = 0
557
+ section_str = section_str.strip()
558
+ for i in range(start, end + 1):
559
+ section_strings[i] = section_str
560
+ else:
561
+ section_strings[0] = input_string
562
+
563
+ # assert 0 in section_prompts, "Section prompts must contain section 0"
564
+ if 0 not in section_strings:
565
+ # use smallest section index. prefer positive index over negative index
566
+ # if all section indices are negative, use the smallest negative index
567
+ indices = list(section_strings.keys())
568
+ if all(i < 0 for i in indices):
569
+ section_index = min(indices)
570
+ else:
571
+ section_index = min(i for i in indices if i >= 0)
572
+ section_strings[0] = section_strings[section_index]
573
+ return section_strings
574
+
575
+ # prepare image
576
+ def preprocess_image(image_path: str):
577
+ image = Image.open(image_path)
578
+ if image.mode == "RGBA":
579
+ alpha = image.split()[-1]
580
+ else:
581
+ alpha = None
582
+ image = image.convert("RGB")
583
+
584
+ image_np = np.array(image) # PIL to numpy, HWC
585
+
586
+ image_np = image_video_dataset.resize_image_to_bucket(image_np, (width, height))
587
+ image_tensor = torch.from_numpy(image_np).float() / 127.5 - 1.0 # -1 to 1.0, HWC
588
+ image_tensor = image_tensor.permute(2, 0, 1)[None, :, None] # HWC -> CHW -> NCFHW, N=1, C=3, F=1
589
+ return image_tensor, image_np, alpha
590
+
591
+ section_image_paths = parse_section_strings(args.image_path)
592
+
593
+ section_images = {}
594
+ for index, image_path in section_image_paths.items():
595
+ img_tensor, img_np, _ = preprocess_image(image_path)
596
+ section_images[index] = (img_tensor, img_np)
597
+
598
+ # check end image
599
+ if args.end_image_path is not None:
600
+ end_image_tensor, _, _ = preprocess_image(args.end_image_path)
601
+ else:
602
+ end_image_tensor = None
603
+
604
+ # check control (reference) images
605
+ if args.control_image_path is not None and len(args.control_image_path) > 0:
606
+ control_image_tensors = []
607
+ control_mask_images = []
608
+ for ctrl_image_path in args.control_image_path:
609
+ control_image_tensor, _, control_mask = preprocess_image(ctrl_image_path)
610
+ control_image_tensors.append(control_image_tensor)
611
+ control_mask_images.append(control_mask)
612
+ else:
613
+ control_image_tensors = None
614
+ control_mask_images = None
615
+
616
+ # configure negative prompt
617
+ n_prompt = args.negative_prompt if args.negative_prompt else ""
618
+
619
+ # parse section prompts
620
+ section_prompts = parse_section_strings(args.prompt)
621
+
622
+ # load text encoder
623
+ if shared_models is not None:
624
+ tokenizer1, text_encoder1 = shared_models["tokenizer1"], shared_models["text_encoder1"]
625
+ tokenizer2, text_encoder2 = shared_models["tokenizer2"], shared_models["text_encoder2"]
626
+ text_encoder1.to(device)
627
+ else:
628
+ tokenizer1, text_encoder1 = load_text_encoder1(args, args.fp8_llm, device)
629
+ tokenizer2, text_encoder2 = load_text_encoder2(args)
630
+ text_encoder2.to(device)
631
+
632
+ logger.info(f"Encoding prompt")
633
+ llama_vecs = {}
634
+ llama_attention_masks = {}
635
+ clip_l_poolers = {}
636
+ with torch.autocast(device_type=device.type, dtype=text_encoder1.dtype), torch.no_grad():
637
+ for index, prompt in section_prompts.items():
638
+ llama_vec, clip_l_pooler = hunyuan.encode_prompt_conds(
639
+ prompt, text_encoder1, text_encoder2, tokenizer1, tokenizer2, custom_system_prompt=args.custom_system_prompt
640
+ )
641
+ llama_vec = llama_vec.cpu()
642
+ clip_l_pooler = clip_l_pooler.cpu()
643
+
644
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
645
+
646
+ llama_vecs[index] = llama_vec
647
+ llama_attention_masks[index] = llama_attention_mask
648
+ clip_l_poolers[index] = clip_l_pooler
649
+
650
+ if args.guidance_scale == 1.0:
651
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vecs[0]), torch.zeros_like(clip_l_poolers[0])
652
+ else:
653
+ with torch.autocast(device_type=device.type, dtype=text_encoder1.dtype), torch.no_grad():
654
+ llama_vec_n, clip_l_pooler_n = hunyuan.encode_prompt_conds(
655
+ n_prompt, text_encoder1, text_encoder2, tokenizer1, tokenizer2, custom_system_prompt=args.custom_system_prompt
656
+ )
657
+ llama_vec_n = llama_vec_n.cpu()
658
+ clip_l_pooler_n = clip_l_pooler_n.cpu()
659
+
660
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
661
+
662
+ # free text encoder and clean memory
663
+ if shared_models is not None: # if shared models are used, do not free them but move to CPU
664
+ text_encoder1.to("cpu")
665
+ text_encoder2.to("cpu")
666
+ del tokenizer1, text_encoder1, tokenizer2, text_encoder2 # do not free shared models
667
+ clean_memory_on_device(device)
668
+
669
+ # load image encoder
670
+ if shared_models is not None:
671
+ feature_extractor, image_encoder = shared_models["feature_extractor"], shared_models["image_encoder"]
672
+ else:
673
+ feature_extractor, image_encoder = load_image_encoders(args)
674
+ image_encoder.to(device)
675
+
676
+ # encode image with image encoder
677
+
678
+ section_image_encoder_last_hidden_states = {}
679
+ for index, (img_tensor, img_np) in section_images.items():
680
+ with torch.no_grad():
681
+ image_encoder_output = hf_clip_vision_encode(img_np, feature_extractor, image_encoder)
682
+ image_encoder_last_hidden_state = image_encoder_output.last_hidden_state.cpu()
683
+ section_image_encoder_last_hidden_states[index] = image_encoder_last_hidden_state
684
+
685
+ # free image encoder and clean memory
686
+ if shared_models is not None:
687
+ image_encoder.to("cpu")
688
+ del image_encoder, feature_extractor
689
+ clean_memory_on_device(device)
690
+
691
+ # VAE encoding
692
+ logger.info(f"Encoding image to latent space")
693
+ vae.to(device)
694
+
695
+ section_start_latents = {}
696
+ for index, (img_tensor, img_np) in section_images.items():
697
+ start_latent = hunyuan.vae_encode(img_tensor, vae).cpu()
698
+ section_start_latents[index] = start_latent
699
+
700
+ end_latent = hunyuan.vae_encode(end_image_tensor, vae).cpu() if end_image_tensor is not None else None
701
+
702
+ control_latents = None
703
+ if control_image_tensors is not None:
704
+ control_latents = []
705
+ for ctrl_image_tensor in control_image_tensors:
706
+ control_latent = hunyuan.vae_encode(ctrl_image_tensor, vae).cpu()
707
+ control_latents.append(control_latent)
708
+
709
+ vae.to("cpu") # move VAE to CPU to save memory
710
+ clean_memory_on_device(device)
711
+
712
+ # prepare model input arguments
713
+ arg_c = {}
714
+ arg_null = {}
715
+ for index in llama_vecs.keys():
716
+ llama_vec = llama_vecs[index]
717
+ llama_attention_mask = llama_attention_masks[index]
718
+ clip_l_pooler = clip_l_poolers[index]
719
+ arg_c_i = {
720
+ "llama_vec": llama_vec,
721
+ "llama_attention_mask": llama_attention_mask,
722
+ "clip_l_pooler": clip_l_pooler,
723
+ "prompt": section_prompts[index], # for debugging
724
+ }
725
+ arg_c[index] = arg_c_i
726
+
727
+ arg_null = {
728
+ "llama_vec": llama_vec_n,
729
+ "llama_attention_mask": llama_attention_mask_n,
730
+ "clip_l_pooler": clip_l_pooler_n,
731
+ }
732
+
733
+ arg_c_img = {}
734
+ for index in section_images.keys():
735
+ image_encoder_last_hidden_state = section_image_encoder_last_hidden_states[index]
736
+ start_latent = section_start_latents[index]
737
+ arg_c_img_i = {
738
+ "image_encoder_last_hidden_state": image_encoder_last_hidden_state,
739
+ "start_latent": start_latent,
740
+ "image_path": section_image_paths[index],
741
+ }
742
+ arg_c_img[index] = arg_c_img_i
743
+
744
+ return height, width, video_seconds, arg_c, arg_null, arg_c_img, end_latent, control_latents, control_mask_images
745
+
746
+
747
+ # def setup_scheduler(args: argparse.Namespace, config, device: torch.device) -> Tuple[Any, torch.Tensor]:
748
+ # """setup scheduler for sampling
749
+
750
+ # Args:
751
+ # args: command line arguments
752
+ # config: model configuration
753
+ # device: device to use
754
+
755
+ # Returns:
756
+ # Tuple[Any, torch.Tensor]: (scheduler, timesteps)
757
+ # """
758
+ # if args.sample_solver == "unipc":
759
+ # scheduler = FlowUniPCMultistepScheduler(num_train_timesteps=config.num_train_timesteps, shift=1, use_dynamic_shifting=False)
760
+ # scheduler.set_timesteps(args.infer_steps, device=device, shift=args.flow_shift)
761
+ # timesteps = scheduler.timesteps
762
+ # elif args.sample_solver == "dpm++":
763
+ # scheduler = FlowDPMSolverMultistepScheduler(
764
+ # num_train_timesteps=config.num_train_timesteps, shift=1, use_dynamic_shifting=False
765
+ # )
766
+ # sampling_sigmas = get_sampling_sigmas(args.infer_steps, args.flow_shift)
767
+ # timesteps, _ = retrieve_timesteps(scheduler, device=device, sigmas=sampling_sigmas)
768
+ # elif args.sample_solver == "vanilla":
769
+ # scheduler = FlowMatchDiscreteScheduler(num_train_timesteps=config.num_train_timesteps, shift=args.flow_shift)
770
+ # scheduler.set_timesteps(args.infer_steps, device=device)
771
+ # timesteps = scheduler.timesteps
772
+
773
+ # # FlowMatchDiscreteScheduler does not support generator argument in step method
774
+ # org_step = scheduler.step
775
+
776
+ # def step_wrapper(
777
+ # model_output: torch.Tensor,
778
+ # timestep: Union[int, torch.Tensor],
779
+ # sample: torch.Tensor,
780
+ # return_dict: bool = True,
781
+ # generator=None,
782
+ # ):
783
+ # return org_step(model_output, timestep, sample, return_dict=return_dict)
784
+
785
+ # scheduler.step = step_wrapper
786
+ # else:
787
+ # raise NotImplementedError("Unsupported solver.")
788
+
789
+ # return scheduler, timesteps
790
+
791
+
792
+ def convert_lora_for_framepack(lora_sd: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
793
+ # Check the format of the LoRA file
794
+ keys = list(lora_sd.keys())
795
+ if keys[0].startswith("lora_unet_"):
796
+ # logging.info(f"Musubi Tuner LoRA detected")
797
+ pass
798
+
799
+ else:
800
+ transformer_prefixes = ["diffusion_model", "transformer"] # to ignore Text Encoder modules
801
+ lora_suffix = None
802
+ prefix = None
803
+ for key in keys:
804
+ if lora_suffix is None and "lora_A" in key:
805
+ lora_suffix = "lora_A"
806
+ if prefix is None:
807
+ pfx = key.split(".")[0]
808
+ if pfx in transformer_prefixes:
809
+ prefix = pfx
810
+ if lora_suffix is not None and prefix is not None:
811
+ break
812
+
813
+ if lora_suffix == "lora_A" and prefix is not None:
814
+ logging.info(f"Diffusion-pipe (?) LoRA detected, converting to the default LoRA format")
815
+ lora_sd = convert_lora_from_diffusion_pipe_or_something(lora_sd, "lora_unet_")
816
+
817
+ else:
818
+ logging.info(f"LoRA file format not recognized. Using it as-is.")
819
+
820
+ # Check LoRA is for FramePack or for HunyuanVideo
821
+ is_hunyuan = False
822
+ for key in lora_sd.keys():
823
+ if "double_blocks" in key or "single_blocks" in key:
824
+ is_hunyuan = True
825
+ break
826
+ if is_hunyuan:
827
+ logging.info("HunyuanVideo LoRA detected, converting to FramePack format")
828
+ lora_sd = convert_hunyuan_to_framepack(lora_sd)
829
+
830
+ return lora_sd
831
+
832
+
833
+ def convert_lora_from_diffusion_pipe_or_something(lora_sd: dict[str, torch.Tensor], prefix: str) -> dict[str, torch.Tensor]:
834
+ """
835
+ Convert LoRA weights from the Diffusers / diffusion-pipe format to the default (Musubi Tuner) format.
836
+ Copied from the Musubi Tuner repo.
837
+ """
838
+ # convert from diffusers(?) to default LoRA
839
+ # Diffusers format: {"diffusion_model.module.name.lora_A.weight": weight, "diffusion_model.module.name.lora_B.weight": weight, ...}
840
+ # default LoRA format: {"prefix_module_name.lora_down.weight": weight, "prefix_module_name.lora_up.weight": weight, ...}
841
+
842
+ # note: Diffusers has no alpha, so alpha is set to rank
843
+ new_weights_sd = {}
844
+ lora_dims = {}
845
+ for key, weight in lora_sd.items():
846
+ diffusers_prefix, key_body = key.split(".", 1)
847
+ if diffusers_prefix != "diffusion_model" and diffusers_prefix != "transformer":
848
+ print(f"unexpected key: {key} in diffusers format")
849
+ continue
850
+
851
+ new_key = f"{prefix}{key_body}".replace(".", "_").replace("_lora_A_", ".lora_down.").replace("_lora_B_", ".lora_up.")
852
+ new_weights_sd[new_key] = weight
853
+
854
+ lora_name = new_key.split(".")[0] # before first dot
855
+ if lora_name not in lora_dims and "lora_down" in new_key:
856
+ lora_dims[lora_name] = weight.shape[0]
857
+
858
+ # add alpha with rank
859
+ for lora_name, dim in lora_dims.items():
860
+ new_weights_sd[f"{lora_name}.alpha"] = torch.tensor(dim)
861
+
862
+ return new_weights_sd
863
+
864
+
865
+ def convert_hunyuan_to_framepack(lora_sd: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
866
+ """
867
+ Convert HunyuanVideo LoRA weights to FramePack format.
868
+ """
869
+ new_lora_sd = {}
870
+ for key, weight in lora_sd.items():
871
+ if "double_blocks" in key:
872
+ key = key.replace("double_blocks", "transformer_blocks")
873
+ key = key.replace("img_mod_linear", "norm1_linear")
874
+ key = key.replace("img_attn_qkv", "attn_to_QKV") # split later
875
+ key = key.replace("img_attn_proj", "attn_to_out_0")
876
+ key = key.replace("img_mlp_fc1", "ff_net_0_proj")
877
+ key = key.replace("img_mlp_fc2", "ff_net_2")
878
+ key = key.replace("txt_mod_linear", "norm1_context_linear")
879
+ key = key.replace("txt_attn_qkv", "attn_add_QKV_proj") # split later
880
+ key = key.replace("txt_attn_proj", "attn_to_add_out")
881
+ key = key.replace("txt_mlp_fc1", "ff_context_net_0_proj")
882
+ key = key.replace("txt_mlp_fc2", "ff_context_net_2")
883
+ elif "single_blocks" in key:
884
+ key = key.replace("single_blocks", "single_transformer_blocks")
885
+ key = key.replace("linear1", "attn_to_QKVM") # split later
886
+ key = key.replace("linear2", "proj_out")
887
+ key = key.replace("modulation_linear", "norm_linear")
888
+ else:
889
+ print(f"Unsupported module name: {key}, only double_blocks and single_blocks are supported")
890
+ continue
891
+
892
+ if "QKVM" in key:
893
+ # split QKVM into Q, K, V, M
894
+ key_q = key.replace("QKVM", "q")
895
+ key_k = key.replace("QKVM", "k")
896
+ key_v = key.replace("QKVM", "v")
897
+ key_m = key.replace("attn_to_QKVM", "proj_mlp")
898
+ if "_down" in key or "alpha" in key:
899
+ # copy QKVM weight or alpha to Q, K, V, M
900
+ assert "alpha" in key or weight.size(1) == 3072, f"QKVM weight size mismatch: {key}. {weight.size()}"
901
+ new_lora_sd[key_q] = weight
902
+ new_lora_sd[key_k] = weight
903
+ new_lora_sd[key_v] = weight
904
+ new_lora_sd[key_m] = weight
905
+ elif "_up" in key:
906
+ # split QKVM weight into Q, K, V, M
907
+ assert weight.size(0) == 21504, f"QKVM weight size mismatch: {key}. {weight.size()}"
908
+ new_lora_sd[key_q] = weight[:3072]
909
+ new_lora_sd[key_k] = weight[3072 : 3072 * 2]
910
+ new_lora_sd[key_v] = weight[3072 * 2 : 3072 * 3]
911
+ new_lora_sd[key_m] = weight[3072 * 3 :] # 21504 - 3072 * 3 = 12288
912
+ else:
913
+ print(f"Unsupported module name: {key}")
914
+ continue
915
+ elif "QKV" in key:
916
+ # split QKV into Q, K, V
917
+ key_q = key.replace("QKV", "q")
918
+ key_k = key.replace("QKV", "k")
919
+ key_v = key.replace("QKV", "v")
920
+ if "_down" in key or "alpha" in key:
921
+ # copy QKV weight or alpha to Q, K, V
922
+ assert "alpha" in key or weight.size(1) == 3072, f"QKV weight size mismatch: {key}. {weight.size()}"
923
+ new_lora_sd[key_q] = weight
924
+ new_lora_sd[key_k] = weight
925
+ new_lora_sd[key_v] = weight
926
+ elif "_up" in key:
927
+ # split QKV weight into Q, K, V
928
+ assert weight.size(0) == 3072 * 3, f"QKV weight size mismatch: {key}. {weight.size()}"
929
+ new_lora_sd[key_q] = weight[:3072]
930
+ new_lora_sd[key_k] = weight[3072 : 3072 * 2]
931
+ new_lora_sd[key_v] = weight[3072 * 2 :]
932
+ else:
933
+ print(f"Unsupported module name: {key}")
934
+ continue
935
+ else:
936
+ # no split needed
937
+ new_lora_sd[key] = weight
938
+
939
+ return new_lora_sd
940
+
941
+
942
+ def generate(
943
+ args: argparse.Namespace, gen_settings: GenerationSettings, shared_models: Optional[Dict] = None
944
+ ) -> tuple[AutoencoderKLCausal3D, torch.Tensor]:
945
+ """main function for generation
946
+
947
+ Args:
948
+ args: command line arguments
949
+ shared_models: dictionary containing pre-loaded models
950
+
951
+ Returns:
952
+ tuple: (AutoencoderKLCausal3D model (vae), torch.Tensor generated latent)
953
+ """
954
+ device, dit_weight_dtype = (gen_settings.device, gen_settings.dit_weight_dtype)
955
+
956
+ # prepare seed
957
+ seed = args.seed if args.seed is not None else random.randint(0, 2**32 - 1)
958
+ args.seed = seed # set seed to args for saving
959
+
960
+ # Check if we have shared models
961
+ if shared_models is not None:
962
+ # Use shared models and encoded data
963
+ vae = shared_models.get("vae")
964
+ height, width, video_seconds, context, context_null, context_img, end_latent, control_latents, control_mask_images = (
965
+ prepare_i2v_inputs(args, device, vae, shared_models)
966
+ )
967
+ else:
968
+ # prepare inputs without shared models
969
+ vae = load_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, device)
970
+ height, width, video_seconds, context, context_null, context_img, end_latent, control_latents, control_mask_images = (
971
+ prepare_i2v_inputs(args, device, vae)
972
+ )
973
+
974
+ if shared_models is None or "model" not in shared_models:
975
+ # load DiT model
976
+ model = load_dit_model(args, device)
977
+
978
+ # merge LoRA weights
979
+ if args.lora_weight is not None and len(args.lora_weight) > 0:
980
+ # ugly hack to common merge_lora_weights function
981
+ merge_lora_weights(lora_framepack, model, args, device, convert_lora_for_framepack)
982
+
983
+ # if we only want to save the model, we can skip the rest
984
+ if args.save_merged_model:
985
+ return None, None
986
+
987
+ # optimize model: fp8 conversion, block swap etc.
988
+ optimize_model(model, args, device)
989
+
990
+ if shared_models is not None:
991
+ shared_models["model"] = model
992
+ else:
993
+ # use shared model
994
+ model: HunyuanVideoTransformer3DModelPacked = shared_models["model"]
995
+ model.move_to_device_except_swap_blocks(device)
996
+ model.prepare_block_swap_before_forward()
997
+
998
+ # sampling
999
+ latent_window_size = args.latent_window_size # default is 9
1000
+ # ex: (5s * 30fps) / (9 * 4) = 4.16 -> 4 sections, 60s -> 1800 / 36 = 50 sections
1001
+ total_latent_sections = (video_seconds * 30) / (latent_window_size * 4)
1002
+ total_latent_sections = int(max(round(total_latent_sections), 1))
1003
+
1004
+ # set random generator
1005
+ seed_g = torch.Generator(device="cpu")
1006
+ seed_g.manual_seed(seed)
1007
+ num_frames = latent_window_size * 4 - 3
1008
+
1009
+ logger.info(
1010
+ f"Video size: {height}x{width}@{video_seconds} (HxW@seconds), fps: {args.fps}, num sections: {total_latent_sections}, "
1011
+ f"infer_steps: {args.infer_steps}, frames per generation: {num_frames}"
1012
+ )
1013
+
1014
+ # video generation ######
1015
+ f1_mode = args.f1
1016
+ one_frame_inference = None
1017
+ if args.one_frame_inference is not None:
1018
+ one_frame_inference = set()
1019
+ for mode in args.one_frame_inference.split(","):
1020
+ one_frame_inference.add(mode.strip())
1021
+
1022
+ if one_frame_inference is not None:
1023
+ real_history_latents = generate_with_one_frame_inference(
1024
+ args,
1025
+ model,
1026
+ context,
1027
+ context_null,
1028
+ context_img,
1029
+ control_latents,
1030
+ control_mask_images,
1031
+ latent_window_size,
1032
+ height,
1033
+ width,
1034
+ device,
1035
+ seed_g,
1036
+ one_frame_inference,
1037
+ )
1038
+ else:
1039
+ # prepare history latents
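+ # history layout along the frame axis: 1 frame for clean_latents_post, 2 for clean_latents_2x, 16 for clean_latents_4x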
1040
+ history_latents = torch.zeros((1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32)
1041
+ if end_latent is not None and not f1_mode:
1042
+ logger.info(f"Use end image(s): {args.end_image_path}")
1043
+ history_latents[:, :, :1] = end_latent.to(history_latents)
1044
+
1045
+ # prepare clean latents and indices
1046
+ if not f1_mode:
1047
+ # Inverted Anti-drifting
1048
+ total_generated_latent_frames = 0
1049
+ latent_paddings = list(reversed(range(total_latent_sections)))
1050
+
1051
+ if total_latent_sections > 4 and one_frame_inference is None:
1052
+ # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some
1053
+ # items looks better than expanding it when total_latent_sections > 4
1054
+ # One can try to remove below trick and just
1055
+ # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
1056
+ # 4 sections: 3, 2, 1, 0. 50 sections: 3, 2, 2, ... 2, 1, 0
1057
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
1058
+
1059
+ if args.latent_paddings is not None:
1060
+ # parse user defined latent paddings
1061
+ user_latent_paddings = [int(x) for x in args.latent_paddings.split(",")]
1062
+ if len(user_latent_paddings) < total_latent_sections:
1063
+ print(
1064
+ f"User defined latent paddings length {len(user_latent_paddings)} does not match total sections {total_latent_sections}."
1065
+ )
1066
+ print(f"Use default paddings instead for unspecified sections.")
1067
+ latent_paddings[: len(user_latent_paddings)] = user_latent_paddings
1068
+ elif len(user_latent_paddings) > total_latent_sections:
1069
+ print(
1070
+ f"User defined latent paddings length {len(user_latent_paddings)} is greater than total sections {total_latent_sections}."
1071
+ )
1072
+ print(f"Use only first {total_latent_sections} paddings instead.")
1073
+ latent_paddings = user_latent_paddings[:total_latent_sections]
1074
+ else:
1075
+ latent_paddings = user_latent_paddings
1076
+ else:
1077
+ start_latent = context_img[0]["start_latent"]
1078
+ history_latents = torch.cat([history_latents, start_latent], dim=2)
1079
+ total_generated_latent_frames = 1 # a bit hacky, but we employ the same logic as in official code
1080
+ latent_paddings = [0] * total_latent_sections # dummy paddings for F1 mode
1081
+
1082
+ latent_paddings = list(latent_paddings) # make sure it's a list
1083
+ for loop_index in range(total_latent_sections):
1084
+ latent_padding = latent_paddings[loop_index]
1085
+
1086
+ if not f1_mode:
1087
+ # Inverted Anti-drifting
1088
+ section_index_reverse = loop_index # 0, 1, 2, 3
1089
+ section_index = total_latent_sections - 1 - section_index_reverse # 3, 2, 1, 0
1090
+ section_index_from_last = -(section_index_reverse + 1) # -1, -2, -3, -4
1091
+
1092
+ is_last_section = section_index == 0
1093
+ is_first_section = section_index_reverse == 0
1094
+ latent_padding_size = latent_padding * latent_window_size
1095
+
1096
+ logger.info(f"latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}")
1097
+ else:
1098
+ section_index = loop_index # 0, 1, 2, 3
1099
+ section_index_from_last = section_index - total_latent_sections # -4, -3, -2, -1
1100
+ is_last_section = loop_index == total_latent_sections - 1
1101
+ is_first_section = loop_index == 0
1102
+ latent_padding_size = 0 # dummy padding for F1 mode
1103
+
1104
+ # select start latent
1105
+ if section_index_from_last in context_img:
1106
+ image_index = section_index_from_last
1107
+ elif section_index in context_img:
1108
+ image_index = section_index
1109
+ else:
1110
+ image_index = 0
1111
+
1112
+ start_latent = context_img[image_index]["start_latent"]
1113
+ image_path = context_img[image_index]["image_path"]
1114
+ if image_index != 0: # use section image other than section 0
1115
+ logger.info(
1116
+ f"Apply experimental section image, latent_padding_size = {latent_padding_size}, image_path = {image_path}"
1117
+ )
1118
+
1119
+ if not f1_mode:
1120
+ # Inverted Anti-drifting
1121
+ indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
1122
+ (
1123
+ clean_latent_indices_pre,
1124
+ blank_indices,
1125
+ latent_indices,
1126
+ clean_latent_indices_post,
1127
+ clean_latent_2x_indices,
1128
+ clean_latent_4x_indices,
1129
+ ) = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
1130
+
1131
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
1132
+
1133
+ clean_latents_pre = start_latent.to(history_latents)
1134
+ clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, : 1 + 2 + 16, :, :].split(
1135
+ [1, 2, 16], dim=2
1136
+ )
1137
+ clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
1138
+
1139
+ else:
1140
+ # F1 mode
1141
+ indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
1142
+ (
1143
+ clean_latent_indices_start,
1144
+ clean_latent_4x_indices,
1145
+ clean_latent_2x_indices,
1146
+ clean_latent_1x_indices,
1147
+ latent_indices,
1148
+ ) = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
1149
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
1150
+
1151
+ clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]) :, :, :].split(
1152
+ [16, 2, 1], dim=2
1153
+ )
1154
+ clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
1155
+
1156
+ # if use_teacache:
1157
+ # transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
1158
+ # else:
1159
+ # transformer.initialize_teacache(enable_teacache=False)
1160
+
1161
+ # prepare conditioning inputs
1162
+ if section_index_from_last in context:
1163
+ prompt_index = section_index_from_last
1164
+ elif section_index in context:
1165
+ prompt_index = section_index
1166
+ else:
1167
+ prompt_index = 0
1168
+
1169
+ context_for_index = context[prompt_index]
1170
+ # if args.section_prompts is not None:
1171
+ logger.info(f"Section {section_index}: {context_for_index['prompt']}")
1172
+
1173
+ llama_vec = context_for_index["llama_vec"].to(device, dtype=torch.bfloat16)
1174
+ llama_attention_mask = context_for_index["llama_attention_mask"].to(device)
1175
+ clip_l_pooler = context_for_index["clip_l_pooler"].to(device, dtype=torch.bfloat16)
1176
+
1177
+ image_encoder_last_hidden_state = context_img[image_index]["image_encoder_last_hidden_state"].to(
1178
+ device, dtype=torch.bfloat16
1179
+ )
1180
+
1181
+ llama_vec_n = context_null["llama_vec"].to(device, dtype=torch.bfloat16)
1182
+ llama_attention_mask_n = context_null["llama_attention_mask"].to(device)
1183
+ clip_l_pooler_n = context_null["clip_l_pooler"].to(device, dtype=torch.bfloat16)
1184
+
1185
+ generated_latents = sample_hunyuan(
1186
+ transformer=model,
1187
+ sampler=args.sample_solver,
1188
+ width=width,
1189
+ height=height,
1190
+ frames=num_frames,
1191
+ real_guidance_scale=args.guidance_scale,
1192
+ distilled_guidance_scale=args.embedded_cfg_scale,
1193
+ guidance_rescale=args.guidance_rescale,
1194
+ # shift=3.0,
1195
+ num_inference_steps=args.infer_steps,
1196
+ generator=seed_g,
1197
+ prompt_embeds=llama_vec,
1198
+ prompt_embeds_mask=llama_attention_mask,
1199
+ prompt_poolers=clip_l_pooler,
1200
+ negative_prompt_embeds=llama_vec_n,
1201
+ negative_prompt_embeds_mask=llama_attention_mask_n,
1202
+ negative_prompt_poolers=clip_l_pooler_n,
1203
+ device=device,
1204
+ dtype=torch.bfloat16,
1205
+ image_embeddings=image_encoder_last_hidden_state,
1206
+ latent_indices=latent_indices,
1207
+ clean_latents=clean_latents,
1208
+ clean_latent_indices=clean_latent_indices,
1209
+ clean_latents_2x=clean_latents_2x,
1210
+ clean_latent_2x_indices=clean_latent_2x_indices,
1211
+ clean_latents_4x=clean_latents_4x,
1212
+ clean_latent_4x_indices=clean_latent_4x_indices,
1213
+ )
1214
+
1215
+ # concatenate generated latents
1216
+ total_generated_latent_frames += int(generated_latents.shape[2])
1217
+ if not f1_mode:
1218
+ # Inverted Anti-drifting: prepend generated latents to history latents
1219
+ if is_last_section:
1220
+ generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
1221
+ total_generated_latent_frames += 1
1222
+
1223
+ history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
1224
+ real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
1225
+ else:
1226
+ # F1 mode: append generated latents to history latents
1227
+ history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
1228
+ real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
1229
+
1230
+ logger.info(f"Generated. Latent shape {real_history_latents.shape}")
1231
+
1232
+ # # TODO support saving intermediate video
1233
+ # clean_memory_on_device(device)
1234
+ # vae.to(device)
1235
+ # if history_pixels is None:
1236
+ # history_pixels = hunyuan.vae_decode(real_history_latents, vae).cpu()
1237
+ # else:
1238
+ # section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
1239
+ # overlapped_frames = latent_window_size * 4 - 3
1240
+ # current_pixels = hunyuan.vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu()
1241
+ # history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
1242
+ # vae.to("cpu")
1243
+ # # if not is_last_section:
1244
+ # # # save intermediate video
1245
+ # # save_video(history_pixels[0], args, total_generated_latent_frames)
1246
+ # print(f"Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}")
1247
+
1248
+ # Only clean up shared models if they were created within this function
1249
+ if shared_models is None:
1250
+ del model # free memory
1251
+ synchronize_device(device)
1252
+ else:
1253
+ # move model to CPU to save memory
1254
+ model.to("cpu")
1255
+
1256
+ # wait for 5 seconds until block swap is done
1257
+ if args.blocks_to_swap > 0:
1258
+ logger.info("Waiting for 5 seconds to finish block swap")
1259
+ time.sleep(5)
1260
+
1261
+ gc.collect()
1262
+ clean_memory_on_device(device)
1263
+
1264
+ return vae, real_history_latents
1265
+
1266
+
1267
+ def generate_with_one_frame_inference(
1268
+ args: argparse.Namespace,
1269
+ model: HunyuanVideoTransformer3DModelPacked,
1270
+ context: Dict[int, Dict[str, torch.Tensor]],
1271
+ context_null: Dict[str, torch.Tensor],
1272
+ context_img: Dict[int, Dict[str, torch.Tensor]],
1273
+ control_latents: Optional[List[torch.Tensor]],
1274
+ control_mask_images: Optional[List[Optional[Image.Image]]],
1275
+ latent_window_size: int,
1276
+ height: int,
1277
+ width: int,
1278
+ device: torch.device,
1279
+ seed_g: torch.Generator,
1280
+ one_frame_inference: set[str],
1281
+ ) -> torch.Tensor:
1282
+ # one frame inference
1283
+ sample_num_frames = 1
1284
+ latent_indices = torch.zeros((1, 1), dtype=torch.int64) # 1x1 latent index for target image
1285
+ latent_indices[:, 0] = latent_window_size # last of latent_window
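+ # default index layout used below: 0 = control/start latents, 1..latent_window_size = generation window
+ # (target defaults to the last slot), 1 + latent_window_size = clean latents post, then 2x (2) and 4x (16) slots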
1286
+
1287
+ def get_latent_mask(mask_image: Image.Image) -> torch.Tensor:
1288
+ if mask_image.mode != "L":
1289
+ mask_image = mask_image.convert("L")
1290
+ mask_image = mask_image.resize((width // 8, height // 8), Image.LANCZOS)
1291
+ mask_image = np.array(mask_image) # PIL (mode L) to numpy, HW
1293
+ mask_image = torch.from_numpy(mask_image).float() / 255.0 # 0 to 1.0, HW
1294
+ mask_image = mask_image.squeeze(-1) # no-op for single-channel (HW) input
1294
+ mask_image = mask_image.unsqueeze(0).unsqueeze(0).unsqueeze(0) # HW -> 111HW (BCFHW)
1295
+ mask_image = mask_image.to(torch.float32)
1296
+ return mask_image
1297
+
1298
+ if control_latents is None or len(control_latents) == 0:
1299
+ logger.info(f"No control images provided for one frame inference. Use zero latents for control images.")
1300
+ control_latents = [torch.zeros(1, 16, 1, height // 8, width // 8, dtype=torch.float32)]
1301
+
1302
+ if "no_post" not in one_frame_inference:
1303
+ # add zero latents as clean latents post
1304
+ control_latents.append(torch.zeros((1, 16, 1, height // 8, width // 8), dtype=torch.float32))
1305
+ logger.info(f"Add zero latents as clean latents post for one frame inference.")
1306
+
1307
+ # kisekaeichi and 1f-mc: both are using control images, but indices are different
1308
+ clean_latents = torch.cat(control_latents, dim=2) # (1, 16, num_control_images, H//8, W//8)
1309
+ clean_latent_indices = torch.zeros((1, len(control_latents)), dtype=torch.int64)
1310
+ if "no_post" not in one_frame_inference:
1311
+ clean_latent_indices[:, -1] = 1 + latent_window_size # default index for clean latents post
1312
+
1313
+ for i in range(len(control_latents)):
1314
+ mask_image = None
1315
+ if args.control_image_mask_path is not None and i < len(args.control_image_mask_path):
1316
+ mask_image = get_latent_mask(Image.open(args.control_image_mask_path[i]))
1317
+ logger.info(
1318
+ f"Apply mask for clean latents 1x for {i + 1}: {args.control_image_mask_path[i]}, shape: {mask_image.shape}"
1319
+ )
1320
+ elif control_mask_images is not None and i < len(control_mask_images) and control_mask_images[i] is not None:
1321
+ mask_image = get_latent_mask(control_mask_images[i])
1322
+ logger.info(f"Apply mask for clean latents 1x for {i + 1} with alpha channel: {mask_image.shape}")
1323
+ if mask_image is not None:
1324
+ clean_latents[:, :, i : i + 1, :, :] = clean_latents[:, :, i : i + 1, :, :] * mask_image
1325
+
1326
+ for one_frame_param in one_frame_inference:
1327
+ if one_frame_param.startswith("target_index="):
1328
+ target_index = int(one_frame_param.split("=")[1])
1329
+ latent_indices[:, 0] = target_index
1330
+ logger.info(f"Set index for target: {target_index}")
1331
+ elif one_frame_param.startswith("control_index="):
1332
+ control_indices = one_frame_param.split("=")[1].split(";")
1333
+ i = 0
1334
+ while i < len(control_indices) and i < clean_latent_indices.shape[1]:
1335
+ control_index = int(control_indices[i])
1336
+ clean_latent_indices[:, i] = control_index
1337
+ i += 1
1338
+ logger.info(f"Set index for clean latent 1x: {control_indices}")
1339
+
1340
+ # "default" option does nothing, so we can skip it
1341
+ if "default" in one_frame_inference:
1342
+ pass
1343
+
1344
+ if "no_2x" in one_frame_inference:
1345
+ clean_latents_2x = None
1346
+ clean_latent_2x_indices = None
1347
+ logger.info(f"No clean_latents_2x")
1348
+ else:
1349
+ clean_latents_2x = torch.zeros((1, 16, 2, height // 8, width // 8), dtype=torch.float32)
1350
+ index = 1 + latent_window_size + 1
1351
+ clean_latent_2x_indices = torch.arange(index, index + 2).unsqueeze(0) # 2
1352
+
1353
+ if "no_4x" in one_frame_inference:
1354
+ clean_latents_4x = None
1355
+ clean_latent_4x_indices = None
1356
+ logger.info(f"No clean_latents_4x")
1357
+ else:
1358
+ clean_latents_4x = torch.zeros((1, 16, 16, height // 8, width // 8), dtype=torch.float32)
1359
+ index = 1 + latent_window_size + 1 + 2
1360
+ clean_latent_4x_indices = torch.arange(index, index + 16).unsqueeze(0) # 16
1361
+
1362
+ logger.info(
1363
+ f"One frame inference. clean_latent: {clean_latents.shape} latent_indices: {latent_indices}, clean_latent_indices: {clean_latent_indices}, num_frames: {sample_num_frames}"
1364
+ )
1365
+
1366
+ # prepare conditioning inputs
1367
+ prompt_index = 0
1368
+ image_index = 0
1369
+
1370
+ context_for_index = context[prompt_index]
1371
+ logger.info(f"Prompt: {context_for_index['prompt']}")
1372
+
1373
+ llama_vec = context_for_index["llama_vec"].to(device, dtype=torch.bfloat16)
1374
+ llama_attention_mask = context_for_index["llama_attention_mask"].to(device)
1375
+ clip_l_pooler = context_for_index["clip_l_pooler"].to(device, dtype=torch.bfloat16)
1376
+
1377
+ image_encoder_last_hidden_state = context_img[image_index]["image_encoder_last_hidden_state"].to(device, dtype=torch.bfloat16)
1378
+
1379
+ llama_vec_n = context_null["llama_vec"].to(device, dtype=torch.bfloat16)
1380
+ llama_attention_mask_n = context_null["llama_attention_mask"].to(device)
1381
+ clip_l_pooler_n = context_null["clip_l_pooler"].to(device, dtype=torch.bfloat16)
1382
+
1383
+ generated_latents = sample_hunyuan(
1384
+ transformer=model,
1385
+ sampler=args.sample_solver,
1386
+ width=width,
1387
+ height=height,
1388
+ frames=1,
1389
+ real_guidance_scale=args.guidance_scale,
1390
+ distilled_guidance_scale=args.embedded_cfg_scale,
1391
+ guidance_rescale=args.guidance_rescale,
1392
+ # shift=3.0,
1393
+ num_inference_steps=args.infer_steps,
1394
+ generator=seed_g,
1395
+ prompt_embeds=llama_vec,
1396
+ prompt_embeds_mask=llama_attention_mask,
1397
+ prompt_poolers=clip_l_pooler,
1398
+ negative_prompt_embeds=llama_vec_n,
1399
+ negative_prompt_embeds_mask=llama_attention_mask_n,
1400
+ negative_prompt_poolers=clip_l_pooler_n,
1401
+ device=device,
1402
+ dtype=torch.bfloat16,
1403
+ image_embeddings=image_encoder_last_hidden_state,
1404
+ latent_indices=latent_indices,
1405
+ clean_latents=clean_latents,
1406
+ clean_latent_indices=clean_latent_indices,
1407
+ clean_latents_2x=clean_latents_2x,
1408
+ clean_latent_2x_indices=clean_latent_2x_indices,
1409
+ clean_latents_4x=clean_latents_4x,
1410
+ clean_latent_4x_indices=clean_latent_4x_indices,
1411
+ )
1412
+
1413
+ real_history_latents = generated_latents.to(clean_latents)
1414
+ return real_history_latents
1415
+
1416
+
1417
+ def save_latent(latent: torch.Tensor, args: argparse.Namespace, height: int, width: int) -> str:
1418
+ """Save latent to file
1419
+
1420
+ Args:
1421
+ latent: Latent tensor
1422
+ args: command line arguments
1423
+ height: height of frame
1424
+ width: width of frame
1425
+
1426
+ Returns:
1427
+ str: Path to saved latent file
1428
+ """
1429
+ save_path = args.save_path
1430
+ os.makedirs(save_path, exist_ok=True)
1431
+ time_flag = datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S")
1432
+
1433
+ seed = args.seed
1434
+ video_seconds = args.video_seconds
1435
+ latent_path = f"{save_path}/{time_flag}_{seed}_latent.safetensors"
1436
+
1437
+ if args.no_metadata:
1438
+ metadata = None
1439
+ else:
1440
+ metadata = {
1441
+ "seeds": f"{seed}",
1442
+ "prompt": f"{args.prompt}",
1443
+ "height": f"{height}",
1444
+ "width": f"{width}",
1445
+ "video_seconds": f"{video_seconds}",
1446
+ "infer_steps": f"{args.infer_steps}",
1447
+ "guidance_scale": f"{args.guidance_scale}",
1448
+ "latent_window_size": f"{args.latent_window_size}",
1449
+ "embedded_cfg_scale": f"{args.embedded_cfg_scale}",
1450
+ "guidance_rescale": f"{args.guidance_rescale}",
1451
+ "sample_solver": f"{args.sample_solver}",
1452
+ "latent_window_size": f"{args.latent_window_size}",
1453
+ "fps": f"{args.fps}",
1454
+ }
1455
+ if args.negative_prompt is not None:
1456
+ metadata["negative_prompt"] = f"{args.negative_prompt}"
1457
+
1458
+ sd = {"latent": latent.contiguous()}
1459
+ save_file(sd, latent_path, metadata=metadata)
1460
+ logger.info(f"Latent saved to: {latent_path}")
1461
+
1462
+ return latent_path
1463
+
1464
+
1465
+ def save_video(
1466
+ video: torch.Tensor, args: argparse.Namespace, original_base_name: Optional[str] = None, latent_frames: Optional[int] = None
1467
+ ) -> str:
1468
+ """Save video to file
1469
+
1470
+ Args:
1471
+ video: Video tensor
1472
+ args: command line arguments
1473
+ original_base_name: Original base name (if latents are loaded from files)
1474
+
1475
+ Returns:
1476
+ str: Path to saved video file
1477
+ """
1478
+ save_path = args.save_path
1479
+ os.makedirs(save_path, exist_ok=True)
1480
+ time_flag = datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S")
1481
+
1482
+ seed = args.seed
1483
+ original_name = "" if original_base_name is None else f"_{original_base_name}"
1484
+ latent_frames = "" if latent_frames is None else f"_{latent_frames}"
1485
+ video_path = f"{save_path}/{time_flag}_{seed}{original_name}{latent_frames}.mp4"
1486
+
1487
+ video = video.unsqueeze(0)
1488
+ save_videos_grid(video, video_path, fps=args.fps, rescale=True)
1489
+ logger.info(f"Video saved to: {video_path}")
1490
+
1491
+ return video_path
1492
+
1493
+
1494
+ def save_images(sample: torch.Tensor, args: argparse.Namespace, original_base_name: Optional[str] = None) -> str:
1495
+ """Save images to directory
1496
+
1497
+ Args:
1498
+ sample: Video tensor
1499
+ args: command line arguments
1500
+ original_base_name: Original base name (if latents are loaded from files)
1501
+
1502
+ Returns:
1503
+ str: Path to saved images directory
1504
+ """
1505
+ save_path = args.save_path
1506
+ os.makedirs(save_path, exist_ok=True)
1507
+ time_flag = datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S")
1508
+
1509
+ seed = args.seed
1510
+ original_name = "" if original_base_name is None else f"_{original_base_name}"
1511
+ image_name = f"{time_flag}_{seed}{original_name}"
1512
+ sample = sample.unsqueeze(0)
1513
+ one_frame_mode = args.one_frame_inference is not None
1514
+ save_images_grid(sample, save_path, image_name, rescale=True, create_subdir=not one_frame_mode)
1515
+ logger.info(f"Sample images saved to: {save_path}/{image_name}")
1516
+
1517
+ return f"{save_path}/{image_name}"
1518
+
1519
+
1520
+ def save_output(
1521
+ args: argparse.Namespace,
1522
+ vae: AutoencoderKLCausal3D,
1523
+ latent: torch.Tensor,
1524
+ device: torch.device,
1525
+ original_base_names: Optional[List[str]] = None,
1526
+ ) -> None:
1527
+ """save output
1528
+
1529
+ Args:
1530
+ args: command line arguments
1531
+ vae: VAE model
1532
+ latent: latent tensor
1533
+ device: device to use
1534
+ original_base_names: original base names (if latents are loaded from files)
1535
+ """
1536
+ height, width = latent.shape[-2], latent.shape[-1] # BCTHW
1537
+ height *= 8
1538
+ width *= 8
1539
+ # print(f"Saving output. Latent shape {latent.shape}; pixel shape {height}x{width}")
1540
+ if args.output_type == "latent" or args.output_type == "both" or args.output_type == "latent_images":
1541
+ # save latent
1542
+ save_latent(latent, args, height, width)
1543
+ if args.output_type == "latent":
1544
+ return
1545
+
1546
+ total_latent_sections = (args.video_seconds * 30) / (args.latent_window_size * 4)
1547
+ total_latent_sections = int(max(round(total_latent_sections), 1))
1548
+ video = decode_latent(
1549
+ args.latent_window_size, total_latent_sections, args.bulk_decode, vae, latent, device, args.one_frame_inference is not None
1550
+ )
1551
+
1552
+ if args.output_type == "video" or args.output_type == "both":
1553
+ # save video
1554
+ original_name = None if original_base_names is None else original_base_names[0]
1555
+ save_video(video, args, original_name)
1556
+
1557
+ elif args.output_type == "images" or args.output_type == "latent_images":
1558
+ # save images
1559
+ original_name = None if original_base_names is None else original_base_names[0]
1560
+ save_images(video, args, original_name)
1561
+
1562
+
1563
+ def preprocess_prompts_for_batch(prompt_lines: List[str], base_args: argparse.Namespace) -> List[Dict]:
1564
+ """Process multiple prompts for batch mode
1565
+
1566
+ Args:
1567
+ prompt_lines: List of prompt lines
1568
+ base_args: Base command line arguments
1569
+
1570
+ Returns:
1571
+ List[Dict]: List of prompt data dictionaries
1572
+ """
1573
+ prompts_data = []
1574
+
1575
+ for line in prompt_lines:
1576
+ line = line.strip()
1577
+ if not line or line.startswith("#"): # Skip empty lines and comments
1578
+ continue
1579
+
1580
+ # Parse prompt line and create override dictionary
1581
+ prompt_data = parse_prompt_line(line)
1582
+ logger.info(f"Parsed prompt data: {prompt_data}")
1583
+ prompts_data.append(prompt_data)
1584
+
1585
+ return prompts_data
1586
+
1587
+
1588
+ def load_shared_models(args: argparse.Namespace) -> Dict:
1589
+ """Load shared models for batch processing or interactive mode.
1590
+ Models are loaded to CPU to save memory.
1591
+
1592
+ Args:
1593
+ args: Base command line arguments
1594
+
1595
+ Returns:
1596
+ Dict: Dictionary of shared models
1597
+ """
1598
+ shared_models = {}
1599
+ tokenizer1, text_encoder1 = load_text_encoder1(args, args.fp8_llm, "cpu")
1600
+ tokenizer2, text_encoder2 = load_text_encoder2(args)
1601
+ feature_extractor, image_encoder = load_image_encoders(args)
1602
+ vae = load_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, "cpu")
1603
+ shared_models["tokenizer1"] = tokenizer1
1604
+ shared_models["text_encoder1"] = text_encoder1
1605
+ shared_models["tokenizer2"] = tokenizer2
1606
+ shared_models["text_encoder2"] = text_encoder2
1607
+ shared_models["feature_extractor"] = feature_extractor
1608
+ shared_models["image_encoder"] = image_encoder
1609
+ shared_models["vae"] = vae
1610
+
1611
+ return shared_models
1612
+
1613
+
1614
+ def process_batch_prompts(prompts_data: List[Dict], args: argparse.Namespace) -> None:
1615
+ """Process multiple prompts with model reuse
1616
+
1617
+ Args:
1618
+ prompts_data: List of prompt data dictionaries
1619
+ args: Base command line arguments
1620
+ """
1621
+ if not prompts_data:
1622
+ logger.warning("No valid prompts found")
1623
+ return
1624
+
1625
+ # 1. Load configuration
1626
+ gen_settings = get_generation_settings(args)
1627
+ device = gen_settings.device
1628
+
1629
+ # 2. Load models to CPU in advance except for VAE and DiT
1630
+ shared_models = load_shared_models(args)
1631
+
1632
+ # 3. Generate for each prompt
1633
+ all_latents = []
1634
+ all_prompt_args = []
1635
+
1636
+ with torch.no_grad():
1637
+ for prompt_data in prompts_data:
1638
+ prompt = prompt_data["prompt"]
1639
+ prompt_args = apply_overrides(args, prompt_data)
1640
+ logger.info(f"Processing prompt: {prompt}")
1641
+
1642
+ try:
1643
+ vae, latent = generate(prompt_args, gen_settings, shared_models)
1644
+
1645
+ # Save latent if needed
1646
+ if args.output_type == "latent" or args.output_type == "both" or args.output_type == "latent_images":
1647
+ height, width = latent.shape[-2], latent.shape[-1] # BCTHW
1648
+ height *= 8
1649
+ width *= 8
1650
+ save_latent(latent, prompt_args, height, width)
1651
+
1652
+ all_latents.append(latent)
1653
+ all_prompt_args.append(prompt_args)
1654
+ except Exception as e:
1655
+ logger.error(f"Error processing prompt: {prompt}. Error: {e}")
1656
+ continue
1657
+
1658
+ # 4. Free models
1659
+ if "model" in shared_models:
1660
+ del shared_models["model"]
1661
+ del shared_models["tokenizer1"]
1662
+ del shared_models["text_encoder1"]
1663
+ del shared_models["tokenizer2"]
1664
+ del shared_models["text_encoder2"]
1665
+ del shared_models["feature_extractor"]
1666
+ del shared_models["image_encoder"]
1667
+
1668
+ clean_memory_on_device(device)
1669
+ synchronize_device(device)
1670
+
1671
+ # 5. Decode latents if needed
1672
+ if args.output_type != "latent":
1673
+ logger.info("Decoding latents to videos/images")
1674
+ vae.to(device)
1675
+
1676
+ for i, (latent, prompt_args) in enumerate(zip(all_latents, all_prompt_args)):
1677
+ logger.info(f"Decoding output {i+1}/{len(all_latents)}")
1678
+
1679
+ # avoid saving latents again (ugly hack)
1680
+ if prompt_args.output_type == "both":
1681
+ prompt_args.output_type = "video"
1682
+ elif prompt_args.output_type == "latent_images":
1683
+ prompt_args.output_type = "images"
1684
+
1685
+ save_output(prompt_args, vae, latent[0], device)
1686
+
1687
+
1688
+ def process_interactive(args: argparse.Namespace) -> None:
1689
+ """Process prompts in interactive mode
1690
+
1691
+ Args:
1692
+ args: Base command line arguments
1693
+ """
1694
+ gen_settings = get_generation_settings(args)
1695
+ device = gen_settings.device
1696
+ shared_models = load_shared_models(args)
1697
+
1698
+ print("Interactive mode. Enter prompts (Ctrl+D or Ctrl+Z (Windows) to exit):")
1699
+
1700
+ try:
1701
+ while True:
1702
+ try:
1703
+ line = input("> ")
1704
+ if not line.strip():
1705
+ continue
1706
+
1707
+ # Parse prompt
1708
+ prompt_data = parse_prompt_line(line)
1709
+ prompt_args = apply_overrides(args, prompt_data)
1710
+
1711
+ # Generate latent
1712
+ vae, latent = generate(prompt_args, gen_settings, shared_models)
1713
+
1714
+ # Save latent and video
1715
+ save_output(prompt_args, vae, latent[0], device)
1716
+
1717
+ except KeyboardInterrupt:
1718
+ print("\nInterrupted. Continue (Ctrl+D or Ctrl+Z (Windows) to exit)")
1719
+ continue
1720
+
1721
+ except EOFError:
1722
+ print("\nExiting interactive mode")
1723
+
1724
+
1725
+ def get_generation_settings(args: argparse.Namespace) -> GenerationSettings:
1726
+ device = torch.device(args.device)
1727
+
1728
+ dit_weight_dtype = None # default
1729
+ if args.fp8_scaled:
1730
+ dit_weight_dtype = None # various precision weights, so don't cast to specific dtype
1731
+ elif args.fp8:
1732
+ dit_weight_dtype = torch.float8_e4m3fn
1733
+
1734
+ logger.info(f"Using device: {device}, DiT weight precision: {dit_weight_dtype}")
1735
+
1736
+ gen_settings = GenerationSettings(device=device, dit_weight_dtype=dit_weight_dtype)
1737
+ return gen_settings
1738
+
1739
+
1740
+ def main():
1741
+ # Parse arguments
1742
+ args = parse_args()
1743
+
1744
+ # Check if latents are provided
1745
+ latents_mode = args.latent_path is not None and len(args.latent_path) > 0
1746
+
1747
+ # Set device
1748
+ device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu"
1749
+ device = torch.device(device)
1750
+ logger.info(f"Using device: {device}")
1751
+ args.device = device
1752
+
1753
+ if latents_mode:
1754
+ # Original latent decode mode
1755
+ original_base_names = []
1756
+ latents_list = []
1757
+ seeds = []
1758
+
1759
+ # assert len(args.latent_path) == 1, "Only one latent path is supported for now"
1760
+
1761
+ for latent_path in args.latent_path:
1762
+ original_base_names.append(os.path.splitext(os.path.basename(latent_path))[0])
1763
+ seed = 0
1764
+
1765
+ if os.path.splitext(latent_path)[1] != ".safetensors":
1766
+ latents = torch.load(latent_path, map_location="cpu")
1767
+ else:
1768
+ latents = load_file(latent_path)["latent"]
1769
+ with safe_open(latent_path, framework="pt") as f:
1770
+ metadata = f.metadata()
1771
+ if metadata is None:
1772
+ metadata = {}
1773
+ logger.info(f"Loaded metadata: {metadata}")
1774
+
1775
+ if "seeds" in metadata:
1776
+ seed = int(metadata["seeds"])
1777
+ if "height" in metadata and "width" in metadata:
1778
+ height = int(metadata["height"])
1779
+ width = int(metadata["width"])
1780
+ args.video_size = [height, width]
1781
+ if "video_seconds" in metadata:
1782
+ args.video_seconds = float(metadata["video_seconds"])
1783
+
1784
+ seeds.append(seed)
1785
+ logger.info(f"Loaded latent from {latent_path}. Shape: {latents.shape}")
1786
+
1787
+ if latents.ndim == 5: # [BCTHW]
1788
+ latents = latents.squeeze(0) # [CTHW]
1789
+
1790
+ latents_list.append(latents)
1791
+
1792
+ # latent = torch.stack(latents_list, dim=0) # [N, ...], must be same shape
1793
+
1794
+ for i, latent in enumerate(latents_list):
1795
+ args.seed = seeds[i]
1796
+
1797
+ vae = load_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, device)
1798
+ save_output(args, vae, latent, device, original_base_names)
1799
+
1800
+ elif args.from_file:
1801
+ # Batch mode from file
1802
+
1803
+ # Read prompts from file
1804
+ with open(args.from_file, "r", encoding="utf-8") as f:
1805
+ prompt_lines = f.readlines()
1806
+
1807
+ # Process prompts
1808
+ prompts_data = preprocess_prompts_for_batch(prompt_lines, args)
1809
+ process_batch_prompts(prompts_data, args)
1810
+
1811
+ elif args.interactive:
1812
+ # Interactive mode
1813
+ process_interactive(args)
1814
+
1815
+ else:
1816
+ # Single prompt mode (original behavior)
1817
+
1818
+ # Generate latent
1819
+ gen_settings = get_generation_settings(args)
1820
+ vae, latent = generate(args, gen_settings)
1821
+ # print(f"Generated latent shape: {latent.shape}")
1822
+ if args.save_merged_model:
1823
+ return
1824
+
1825
+ # Save latent and video
1826
+ save_output(args, vae, latent[0], device)
1827
+
1828
+ logger.info("Done!")
1829
+
1830
+
1831
+ if __name__ == "__main__":
1832
+ main()
fpack_train_network.py ADDED
@@ -0,0 +1,617 @@
1
+ import argparse
2
+ import gc
3
+ import math
4
+ import time
5
+ from typing import Optional
6
+ from PIL import Image
7
+
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torchvision.transforms.functional as TF
12
+ from tqdm import tqdm
13
+ from accelerate import Accelerator, init_empty_weights
14
+
15
+ from dataset import image_video_dataset
16
+ from dataset.image_video_dataset import ARCHITECTURE_FRAMEPACK, ARCHITECTURE_FRAMEPACK_FULL, load_video
17
+ from fpack_generate_video import decode_latent
18
+ from frame_pack import hunyuan
19
+ from frame_pack.clip_vision import hf_clip_vision_encode
20
+ from frame_pack.framepack_utils import load_image_encoders, load_text_encoder1, load_text_encoder2
21
+ from frame_pack.framepack_utils import load_vae as load_framepack_vae
22
+ from frame_pack.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked, load_packed_model
23
+ from frame_pack.k_diffusion_hunyuan import sample_hunyuan
24
+ from frame_pack.utils import crop_or_pad_yield_mask
25
+ from dataset.image_video_dataset import resize_image_to_bucket
26
+ from hv_train_network import NetworkTrainer, load_prompts, clean_memory_on_device, setup_parser_common, read_config_from_file
27
+
28
+ import logging
29
+
30
+ logger = logging.getLogger(__name__)
31
+ logging.basicConfig(level=logging.INFO)
32
+
33
+ from utils import model_utils
34
+ from utils.safetensors_utils import load_safetensors, MemoryEfficientSafeOpen
35
+
36
+
37
+ class FramePackNetworkTrainer(NetworkTrainer):
38
+ def __init__(self):
39
+ super().__init__()
40
+
41
+ # region model specific
42
+
43
+ @property
44
+ def architecture(self) -> str:
45
+ return ARCHITECTURE_FRAMEPACK
46
+
47
+ @property
48
+ def architecture_full_name(self) -> str:
49
+ return ARCHITECTURE_FRAMEPACK_FULL
50
+
51
+ def handle_model_specific_args(self, args):
52
+ self._i2v_training = True
53
+ self._control_training = False
54
+ self.default_guidance_scale = 10.0 # embedded guidance scale
55
+
56
+ def process_sample_prompts(
57
+ self,
58
+ args: argparse.Namespace,
59
+ accelerator: Accelerator,
60
+ sample_prompts: str,
61
+ ):
62
+ device = accelerator.device
63
+
64
+ logger.info(f"cache Text Encoder outputs for sample prompt: {sample_prompts}")
65
+ prompts = load_prompts(sample_prompts)
66
+
67
+ # load text encoder
68
+ tokenizer1, text_encoder1 = load_text_encoder1(args, args.fp8_llm, device)
69
+ tokenizer2, text_encoder2 = load_text_encoder2(args)
70
+ text_encoder2.to(device)
71
+
72
+ sample_prompts_te_outputs = {} # (prompt) -> (t1 embeds, t1 mask, t2 embeds)
73
+ for prompt_dict in prompts:
74
+ for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]:
75
+ if p is None or p in sample_prompts_te_outputs:
76
+ continue
77
+ logger.info(f"cache Text Encoder outputs for prompt: {p}")
78
+ with torch.amp.autocast(device_type=device.type, dtype=text_encoder1.dtype), torch.no_grad():
79
+ llama_vec, clip_l_pooler = hunyuan.encode_prompt_conds(p, text_encoder1, text_encoder2, tokenizer1, tokenizer2)
80
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
81
+
82
+ llama_vec = llama_vec.to("cpu")
83
+ llama_attention_mask = llama_attention_mask.to("cpu")
84
+ clip_l_pooler = clip_l_pooler.to("cpu")
85
+ sample_prompts_te_outputs[p] = (llama_vec, llama_attention_mask, clip_l_pooler)
86
+ del text_encoder1, text_encoder2
87
+ clean_memory_on_device(device)
88
+
89
+ # image embedding for I2V training
90
+ feature_extractor, image_encoder = load_image_encoders(args)
91
+ image_encoder.to(device)
92
+
93
+ # encode image with image encoder
94
+ sample_prompts_image_embs = {}
95
+ for prompt_dict in prompts:
96
+ image_path = prompt_dict.get("image_path", None)
97
+ assert image_path is not None, "image_path should be set for I2V training"
98
+ if image_path in sample_prompts_image_embs:
99
+ continue
100
+
101
+ logger.info(f"Encoding image to image encoder context: {image_path}")
102
+
103
+ height = prompt_dict.get("height", 256)
104
+ width = prompt_dict.get("width", 256)
105
+
106
+ img = Image.open(image_path).convert("RGB")
107
+ img_np = np.array(img) # PIL to numpy, HWC
108
+ img_np = image_video_dataset.resize_image_to_bucket(img_np, (width, height)) # returns a numpy array
109
+
110
+ with torch.no_grad():
111
+ image_encoder_output = hf_clip_vision_encode(img_np, feature_extractor, image_encoder)
112
+ image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
113
+
114
+ image_encoder_last_hidden_state = image_encoder_last_hidden_state.to("cpu")
115
+ sample_prompts_image_embs[image_path] = image_encoder_last_hidden_state
116
+
117
+ del image_encoder
118
+ clean_memory_on_device(device)
119
+
120
+ # prepare sample parameters
121
+ sample_parameters = []
122
+ for prompt_dict in prompts:
123
+ prompt_dict_copy = prompt_dict.copy()
124
+
125
+ p = prompt_dict.get("prompt", "")
126
+ llama_vec, llama_attention_mask, clip_l_pooler = sample_prompts_te_outputs[p]
127
+ prompt_dict_copy["llama_vec"] = llama_vec
128
+ prompt_dict_copy["llama_attention_mask"] = llama_attention_mask
129
+ prompt_dict_copy["clip_l_pooler"] = clip_l_pooler
130
+
131
+ p = prompt_dict.get("negative_prompt", "")
132
+ llama_vec, llama_attention_mask, clip_l_pooler = sample_prompts_te_outputs[p]
133
+ prompt_dict_copy["negative_llama_vec"] = llama_vec
134
+ prompt_dict_copy["negative_llama_attention_mask"] = llama_attention_mask
135
+ prompt_dict_copy["negative_clip_l_pooler"] = clip_l_pooler
136
+
137
+ p = prompt_dict.get("image_path", None)
138
+ prompt_dict_copy["image_encoder_last_hidden_state"] = sample_prompts_image_embs[p]
139
+
140
+ sample_parameters.append(prompt_dict_copy)
141
+
142
+ clean_memory_on_device(accelerator.device)
143
+ return sample_parameters
144
+
145
+ def do_inference(
146
+ self,
147
+ accelerator,
148
+ args,
149
+ sample_parameter,
150
+ vae,
151
+ dit_dtype,
152
+ transformer,
153
+ discrete_flow_shift,
154
+ sample_steps,
155
+ width,
156
+ height,
157
+ frame_count,
158
+ generator,
159
+ do_classifier_free_guidance,
160
+ guidance_scale,
161
+ cfg_scale,
162
+ image_path=None,
163
+ control_video_path=None,
164
+ ):
165
+ """architecture dependent inference"""
166
+ model: HunyuanVideoTransformer3DModelPacked = transformer
167
+ device = accelerator.device
168
+ if cfg_scale is None:
169
+ cfg_scale = 1.0
170
+ do_classifier_free_guidance = do_classifier_free_guidance and cfg_scale != 1.0
171
+
172
+ # prepare parameters
173
+ one_frame_mode = args.one_frame
174
+ if one_frame_mode:
175
+ one_frame_inference = set()
176
+ for mode in sample_parameter["one_frame"].split(","):
177
+ one_frame_inference.add(mode.strip())
178
+ else:
179
+ one_frame_inference = None
180
+
181
+ latent_window_size = args.latent_window_size # default is 9
182
+ latent_f = (frame_count - 1) // 4 + 1
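+ # HunyuanVideo VAE temporal compression: the first frame maps to one latent frame, then every 4 video frames add one more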
183
+ total_latent_sections = math.floor((latent_f - 1) / latent_window_size)
184
+ if total_latent_sections < 1 and not one_frame_mode:
185
+ logger.warning(f"Not enough frames for FramePack: {latent_f}, minimum: {latent_window_size*4+1}")
186
+ return None
187
+
188
+ latent_f = total_latent_sections * latent_window_size + 1
189
+ actual_frame_count = (latent_f - 1) * 4 + 1
190
+ if actual_frame_count != frame_count:
191
+ logger.info(f"Frame count mismatch: {actual_frame_count} != {frame_count}, trimming to {actual_frame_count}")
192
+ frame_count = actual_frame_count
193
+ num_frames = latent_window_size * 4 - 3
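+ # each section of latent_window_size latent frames decodes to latent_window_size * 4 - 3 video frames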
194
+
195
+ # prepare start and control latent
196
+ def encode_image(path):
197
+ image = Image.open(path)
198
+ if image.mode == "RGBA":
199
+ alpha = image.split()[-1]
200
+ image = image.convert("RGB")
201
+ else:
202
+ alpha = None
203
+ image = resize_image_to_bucket(image, (width, height)) # returns a numpy array
204
+ image = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(1).unsqueeze(0).float() # 1, C, 1, H, W
205
+ image = image / 127.5 - 1 # -1 to 1
206
+ return hunyuan.vae_encode(image, vae).to("cpu"), alpha
207
+
208
+ # VAE encoding
209
+ logger.info(f"Encoding image to latent space")
210
+ vae.to(device)
211
+
212
+ start_latent, _ = (
+     encode_image(image_path)
+     if image_path
+     else (torch.zeros((1, 16, 1, height // 8, width // 8), dtype=torch.float32), None)
+ )
215
+
216
+ if one_frame_mode:
217
+ control_latents = []
218
+ control_alphas = []
219
+ if "control_image_path" in sample_parameter:
220
+ for control_image_path in sample_parameter["control_image_path"]:
221
+ control_latent, control_alpha = encode_image(control_image_path)
222
+ control_latents.append(control_latent)
223
+ control_alphas.append(control_alpha)
224
+ else:
225
+ control_latents = None
226
+ control_alphas = None
227
+
228
+ vae.to("cpu") # move VAE to CPU to save memory
229
+ clean_memory_on_device(device)
230
+
231
+ # sampling
232
+ if not one_frame_mode:
233
+ f1_mode = args.f1
234
+ history_latents = torch.zeros((1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32)
235
+
236
+ if not f1_mode:
237
+ total_generated_latent_frames = 0
238
+ latent_paddings = reversed(range(total_latent_sections))
239
+ else:
240
+ total_generated_latent_frames = 1
241
+ history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
242
+ latent_paddings = [0] * total_latent_sections
243
+
244
+ if total_latent_sections > 4:
245
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
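+ # heuristic from the original FramePack implementation: for more than 4 sections, the padding schedule 3, 2, ..., 2, 1, 0 is reported to work better than the plain reversed range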
246
+
247
+ latent_paddings = list(latent_paddings)
248
+ for loop_index in range(total_latent_sections):
249
+ latent_padding = latent_paddings[loop_index]
250
+
251
+ if not f1_mode:
252
+ is_last_section = latent_padding == 0
253
+ latent_padding_size = latent_padding * latent_window_size
254
+
255
+ logger.info(f"latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}")
256
+
257
+ indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
258
+ (
259
+ clean_latent_indices_pre,
260
+ blank_indices,
261
+ latent_indices,
262
+ clean_latent_indices_post,
263
+ clean_latent_2x_indices,
264
+ clean_latent_4x_indices,
265
+ ) = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
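+ # index layout in padded (reverse) mode: [start latent, blank padding, current window, clean post, clean 2x, clean 4x]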
266
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
267
+
268
+ clean_latents_pre = start_latent.to(history_latents)
269
+ clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, : 1 + 2 + 16, :, :].split(
270
+ [1, 2, 16], dim=2
271
+ )
272
+ clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
273
+ else:
274
+ indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
275
+ (
276
+ clean_latent_indices_start,
277
+ clean_latent_4x_indices,
278
+ clean_latent_2x_indices,
279
+ clean_latent_1x_indices,
280
+ latent_indices,
281
+ ) = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
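+ # index layout in F1 (forward) mode: [start latent, clean 4x history, clean 2x, clean 1x, current window]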
282
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
283
+
284
+ clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]) :, :, :].split(
285
+ [16, 2, 1], dim=2
286
+ )
287
+ clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
288
+
289
+ # if use_teacache:
290
+ # transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
291
+ # else:
292
+ # transformer.initialize_teacache(enable_teacache=False)
293
+
294
+ llama_vec = sample_parameter["llama_vec"].to(device, dtype=torch.bfloat16)
295
+ llama_attention_mask = sample_parameter["llama_attention_mask"].to(device)
296
+ clip_l_pooler = sample_parameter["clip_l_pooler"].to(device, dtype=torch.bfloat16)
297
+ if cfg_scale == 1.0:
298
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
299
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
300
+ else:
301
+ llama_vec_n = sample_parameter["negative_llama_vec"].to(device, dtype=torch.bfloat16)
302
+ llama_attention_mask_n = sample_parameter["negative_llama_attention_mask"].to(device)
303
+ clip_l_pooler_n = sample_parameter["negative_clip_l_pooler"].to(device, dtype=torch.bfloat16)
304
+ image_encoder_last_hidden_state = sample_parameter["image_encoder_last_hidden_state"].to(
305
+ device, dtype=torch.bfloat16
306
+ )
307
+
308
+ generated_latents = sample_hunyuan(
309
+ transformer=model,
310
+ sampler=args.sample_solver,
311
+ width=width,
312
+ height=height,
313
+ frames=num_frames,
314
+ real_guidance_scale=cfg_scale,
315
+ distilled_guidance_scale=guidance_scale,
316
+ guidance_rescale=0.0,
317
+ # shift=3.0,
318
+ num_inference_steps=sample_steps,
319
+ generator=generator,
320
+ prompt_embeds=llama_vec,
321
+ prompt_embeds_mask=llama_attention_mask,
322
+ prompt_poolers=clip_l_pooler,
323
+ negative_prompt_embeds=llama_vec_n,
324
+ negative_prompt_embeds_mask=llama_attention_mask_n,
325
+ negative_prompt_poolers=clip_l_pooler_n,
326
+ device=device,
327
+ dtype=torch.bfloat16,
328
+ image_embeddings=image_encoder_last_hidden_state,
329
+ latent_indices=latent_indices,
330
+ clean_latents=clean_latents,
331
+ clean_latent_indices=clean_latent_indices,
332
+ clean_latents_2x=clean_latents_2x,
333
+ clean_latent_2x_indices=clean_latent_2x_indices,
334
+ clean_latents_4x=clean_latents_4x,
335
+ clean_latent_4x_indices=clean_latent_4x_indices,
336
+ )
337
+
338
+ total_generated_latent_frames += int(generated_latents.shape[2])
339
+ if not f1_mode:
340
+ if is_last_section:
341
+ generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
342
+ total_generated_latent_frames += 1
343
+ history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
344
+ real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
345
+ else:
346
+ history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
347
+ real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
348
+
349
+ logger.info(f"Generated. Latent shape {real_history_latents.shape}")
350
+ else:
351
+ # one frame mode
352
+ sample_num_frames = 1
353
+ latent_indices = torch.zeros((1, 1), dtype=torch.int64) # 1x1 latent index for target image
354
+ latent_indices[:, 0] = latent_window_size # last of latent_window
355
+
356
+ def get_latent_mask(mask_image: Image.Image):
357
+ mask_image = mask_image.resize((width // 8, height // 8), Image.LANCZOS)
358
+ mask_image = np.array(mask_image) # PIL to numpy, HWC
359
+ mask_image = torch.from_numpy(mask_image).float() / 255.0 # 0 to 1.0, HWC
360
+ mask_image = mask_image.squeeze(-1) # HWC -> HW
361
+ mask_image = mask_image.unsqueeze(0).unsqueeze(0).unsqueeze(0) # HW -> 111HW (B, C, F, H, W)
362
+ mask_image = mask_image.to(torch.float32)
363
+ return mask_image
364
+
365
+ if control_latents is None or len(control_latents) == 0:
366
+ logger.info(f"No control images provided for one frame inference. Use zero latents for control images.")
367
+ control_latents = [torch.zeros(1, 16, 1, height // 8, width // 8, dtype=torch.float32)]
368
+
369
+ if "no_post" not in one_frame_inference:
370
+ # add zero latents as clean latents post
371
+ control_latents.append(torch.zeros((1, 16, 1, height // 8, width // 8), dtype=torch.float32))
372
+ logger.info(f"Add zero latents as clean latents post for one frame inference.")
373
+
374
+ # kisekaeichi and 1f-mc: both are using control images, but indices are different
375
+ clean_latents = torch.cat(control_latents, dim=2) # (1, 16, num_control_images, H//8, W//8)
376
+ clean_latent_indices = torch.zeros((1, len(control_latents)), dtype=torch.int64)
377
+ if "no_post" not in one_frame_inference:
378
+ clean_latent_indices[:, -1] = 1 + latent_window_size # default index for clean latents post
379
+
380
+ # apply mask for control latents (clean latents)
381
+ for i in range(len(control_alphas)):
382
+ control_alpha = control_alphas[i]
383
+ if control_alpha is not None:
384
+ latent_mask = get_latent_mask(control_alpha)
385
+ logger.info(
386
+ f"Apply mask for clean latents 1x for {i+1}: shape: {latent_mask.shape}"
387
+ )
388
+ clean_latents[:, :, i : i + 1, :, :] = clean_latents[:, :, i : i + 1, :, :] * latent_mask
389
+
390
+ for one_frame_param in one_frame_inference:
391
+ if one_frame_param.startswith("target_index="):
392
+ target_index = int(one_frame_param.split("=")[1])
393
+ latent_indices[:, 0] = target_index
394
+ logger.info(f"Set index for target: {target_index}")
395
+ elif one_frame_param.startswith("control_index="):
396
+ control_indices = one_frame_param.split("=")[1].split(";")
397
+ i = 0
398
+ while i < len(control_indices) and i < clean_latent_indices.shape[1]:
399
+ control_index = int(control_indices[i])
400
+ clean_latent_indices[:, i] = control_index
401
+ i += 1
402
+ logger.info(f"Set index for clean latent 1x: {control_indices}")
403
+
404
+ if "no_2x" in one_frame_inference:
405
+ clean_latents_2x = None
406
+ clean_latent_2x_indices = None
407
+ logger.info(f"No clean_latents_2x")
408
+ else:
409
+ clean_latents_2x = torch.zeros((1, 16, 2, height // 8, width // 8), dtype=torch.float32)
410
+ index = 1 + latent_window_size + 1
411
+ clean_latent_2x_indices = torch.arange(index, index + 2) # 2
412
+
413
+ if "no_4x" in one_frame_inference:
414
+ clean_latents_4x = None
415
+ clean_latent_4x_indices = None
416
+ logger.info(f"No clean_latents_4x")
417
+ else:
+     clean_latents_4x = torch.zeros((1, 16, 16, height // 8, width // 8), dtype=torch.float32)
+     index = 1 + latent_window_size + 1 + 2
+     clean_latent_4x_indices = torch.arange(index, index + 16)  # 16
420
+
421
+ logger.info(
422
+ f"One frame inference. clean_latent: {clean_latents.shape} latent_indices: {latent_indices}, clean_latent_indices: {clean_latent_indices}, num_frames: {sample_num_frames}"
423
+ )
424
+
425
+ # prepare conditioning inputs
426
+ llama_vec = sample_parameter["llama_vec"].to(device, dtype=torch.bfloat16)
427
+ llama_attention_mask = sample_parameter["llama_attention_mask"].to(device)
428
+ clip_l_pooler = sample_parameter["clip_l_pooler"].to(device, dtype=torch.bfloat16)
429
+ if cfg_scale == 1.0:
430
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
431
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
432
+ else:
433
+ llama_vec_n = sample_parameter["negative_llama_vec"].to(device, dtype=torch.bfloat16)
434
+ llama_attention_mask_n = sample_parameter["negative_llama_attention_mask"].to(device)
435
+ clip_l_pooler_n = sample_parameter["negative_clip_l_pooler"].to(device, dtype=torch.bfloat16)
436
+ image_encoder_last_hidden_state = sample_parameter["image_encoder_last_hidden_state"].to(
437
+ device, dtype=torch.bfloat16
438
+ )
439
+
440
+ generated_latents = sample_hunyuan(
441
+ transformer=model,
442
+ sampler=args.sample_solver,
443
+ width=width,
444
+ height=height,
445
+ frames=1,
446
+ real_guidance_scale=cfg_scale,
447
+ distilled_guidance_scale=guidance_scale,
448
+ guidance_rescale=0.0,
449
+ # shift=3.0,
450
+ num_inference_steps=sample_steps,
451
+ generator=generator,
452
+ prompt_embeds=llama_vec,
453
+ prompt_embeds_mask=llama_attention_mask,
454
+ prompt_poolers=clip_l_pooler,
455
+ negative_prompt_embeds=llama_vec_n,
456
+ negative_prompt_embeds_mask=llama_attention_mask_n,
457
+ negative_prompt_poolers=clip_l_pooler_n,
458
+ device=device,
459
+ dtype=torch.bfloat16,
460
+ image_embeddings=image_encoder_last_hidden_state,
461
+ latent_indices=latent_indices,
462
+ clean_latents=clean_latents,
463
+ clean_latent_indices=clean_latent_indices,
464
+ clean_latents_2x=clean_latents_2x,
465
+ clean_latent_2x_indices=clean_latent_2x_indices,
466
+ clean_latents_4x=clean_latents_4x,
467
+ clean_latent_4x_indices=clean_latent_4x_indices,
468
+ )
469
+
470
+ real_history_latents = generated_latents.to(clean_latents)
471
+
472
+ # wait for 5 seconds until block swap is done
473
+ logger.info("Waiting for 5 seconds to finish block swap")
474
+ time.sleep(5)
475
+
476
+ gc.collect()
477
+ clean_memory_on_device(device)
478
+
479
+ video = decode_latent(
480
+ latent_window_size, total_latent_sections, args.bulk_decode, vae, real_history_latents, device, one_frame_mode
481
+ )
482
+ video = video.to("cpu", dtype=torch.float32).unsqueeze(0) # add batch dimension
483
+ video = (video / 2 + 0.5).clamp(0, 1) # -1 to 1 -> 0 to 1
484
+ clean_memory_on_device(device)
485
+
486
+ return video
487
+
488
+ def load_vae(self, args: argparse.Namespace, vae_dtype: torch.dtype, vae_path: str):
489
+ vae_path = args.vae
490
+ logger.info(f"Loading VAE model from {vae_path}")
491
+ vae = load_framepack_vae(args.vae, args.vae_chunk_size, args.vae_spatial_tile_sample_min_size, "cpu")
492
+ return vae
493
+
494
+ def load_transformer(
495
+ self,
496
+ accelerator: Accelerator,
497
+ args: argparse.Namespace,
498
+ dit_path: str,
499
+ attn_mode: str,
500
+ split_attn: bool,
501
+ loading_device: str,
502
+ dit_weight_dtype: Optional[torch.dtype],
503
+ ):
504
+ logger.info(f"Loading DiT model from {dit_path}")
505
+ device = accelerator.device
506
+ model = load_packed_model(device, dit_path, attn_mode, loading_device, args.fp8_scaled, split_attn)
507
+ return model
508
+
509
+ def scale_shift_latents(self, latents):
510
+ # FramePack VAE includes scaling
511
+ return latents
512
+
513
+ def call_dit(
514
+ self,
515
+ args: argparse.Namespace,
516
+ accelerator: Accelerator,
517
+ transformer,
518
+ latents: torch.Tensor,
519
+ batch: dict[str, torch.Tensor],
520
+ noise: torch.Tensor,
521
+ noisy_model_input: torch.Tensor,
522
+ timesteps: torch.Tensor,
523
+ network_dtype: torch.dtype,
524
+ ):
525
+ model: HunyuanVideoTransformer3DModelPacked = transformer
526
+ device = accelerator.device
527
+ batch_size = latents.shape[0]
528
+
529
+ # maybe model.dtype is better than network_dtype...
530
+ distilled_guidance = torch.tensor([args.guidance_scale * 1000.0] * batch_size).to(device=device, dtype=network_dtype)
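+ # guidance-distilled conditioning: guidance_scale * 1000 is fed to the model as an embedding instead of applying classifier-free guidance here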
531
+ latents = latents.to(device=accelerator.device, dtype=network_dtype)
532
+ noisy_model_input = noisy_model_input.to(device=accelerator.device, dtype=network_dtype)
533
+ # for k, v in batch.items():
534
+ # if isinstance(v, torch.Tensor):
535
+ # print(f"{k}: {v.shape} {v.dtype} {v.device}")
536
+ with accelerator.autocast():
537
+ clean_latent_2x_indices = batch["clean_latent_2x_indices"] if "clean_latent_2x_indices" in batch else None
538
+ if clean_latent_2x_indices is not None:
539
+ clean_latent_2x = batch["latents_clean_2x"] if "latents_clean_2x" in batch else None
540
+ if clean_latent_2x is None:
541
+ clean_latent_2x = torch.zeros(
542
+ (batch_size, 16, 2, latents.shape[3], latents.shape[4]), dtype=latents.dtype, device=latents.device
543
+ )
544
+ else:
545
+ clean_latent_2x = None
546
+
547
+ clean_latent_4x_indices = batch["clean_latent_4x_indices"] if "clean_latent_4x_indices" in batch else None
548
+ if clean_latent_4x_indices is not None:
549
+ clean_latent_4x = batch["latents_clean_4x"] if "latents_clean_4x" in batch else None
550
+ if clean_latent_4x is None:
551
+ clean_latent_4x = torch.zeros(
552
+ (batch_size, 16, 16, latents.shape[3], latents.shape[4]), dtype=latents.dtype, device=latents.device
553
+ )
554
+ else:
555
+ clean_latent_4x = None
556
+
557
+ model_pred = model(
558
+ hidden_states=noisy_model_input,
559
+ timestep=timesteps,
560
+ encoder_hidden_states=batch["llama_vec"],
561
+ encoder_attention_mask=batch["llama_attention_mask"],
562
+ pooled_projections=batch["clip_l_pooler"],
563
+ guidance=distilled_guidance,
564
+ latent_indices=batch["latent_indices"],
565
+ clean_latents=batch["latents_clean"],
566
+ clean_latent_indices=batch["clean_latent_indices"],
567
+ clean_latents_2x=clean_latent_2x,
568
+ clean_latent_2x_indices=clean_latent_2x_indices,
569
+ clean_latents_4x=clean_latent_4x,
570
+ clean_latent_4x_indices=clean_latent_4x_indices,
571
+ image_embeddings=batch["image_embeddings"],
572
+ return_dict=False,
573
+ )
574
+ model_pred = model_pred[0] # returns tuple (model_pred, )
575
+
576
+ # flow matching loss
577
+ target = noise - latents
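+ # velocity target for flow matching: with x_t interpolating from latents to noise, d x_t / dt = noise - latents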
578
+
579
+ return model_pred, target
580
+
581
+ # endregion model specific
582
+
583
+
584
+ def framepack_setup_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
585
+ """FramePack specific parser setup"""
586
+ parser.add_argument("--fp8_scaled", action="store_true", help="use scaled fp8 for DiT / DiTにスケーリングされたfp8を使う")
587
+ parser.add_argument("--fp8_llm", action="store_true", help="use fp8 for LLM / LLMにfp8を使う")
588
+ parser.add_argument("--text_encoder1", type=str, help="Text Encoder 1 directory / テキストエンコーダ1のディレクトリ")
589
+ parser.add_argument("--text_encoder2", type=str, help="Text Encoder 2 directory / テキストエンコーダ2のディレクトリ")
590
+ parser.add_argument("--vae_chunk_size", type=int, default=None, help="chunk size for CausalConv3d in VAE")
591
+ parser.add_argument(
592
+ "--vae_spatial_tile_sample_min_size", type=int, default=None, help="spatial tile sample min size for VAE, default 256"
593
+ )
594
+ parser.add_argument("--image_encoder", type=str, required=True, help="Image encoder (CLIP) checkpoint path or directory")
595
+ parser.add_argument("--latent_window_size", type=int, default=9, help="FramePack latent window size (default 9)")
596
+ parser.add_argument("--bulk_decode", action="store_true", help="decode all frames at once in sample generation")
597
+ parser.add_argument("--f1", action="store_true", help="Use F1 sampling method for sample generation")
598
+ parser.add_argument("--one_frame", action="store_true", help="Use one frame sampling method for sample generation")
599
+ return parser
600
+
601
+
602
+ if __name__ == "__main__":
603
+ parser = setup_parser_common()
604
+ parser = framepack_setup_parser(parser)
605
+
606
+ args = parser.parse_args()
607
+ args = read_config_from_file(args, parser)
608
+
609
+ assert (
610
+ args.vae_dtype is None or args.vae_dtype == "float16"
611
+ ), "VAE dtype must be float16 / VAEのdtypeはfloat16でなければなりません"
612
+ args.vae_dtype = "float16" # fixed
613
+ args.dit_dtype = "bfloat16" # fixed
614
+ args.sample_solver = "unipc" # for sample generation, fixed to unipc
615
+
616
+ trainer = FramePackNetworkTrainer()
617
+ trainer.train(args)
frame_pack/__init__.py ADDED
File without changes
frame_pack/bucket_tools.py ADDED
@@ -0,0 +1,30 @@
+ bucket_options = {
+     640: [
+         (416, 960),
+         (448, 864),
+         (480, 832),
+         (512, 768),
+         (544, 704),
+         (576, 672),
+         (608, 640),
+         (640, 608),
+         (672, 576),
+         (704, 544),
+         (768, 512),
+         (832, 480),
+         (864, 448),
+         (960, 416),
+     ],
+ }
+
+
+ def find_nearest_bucket(h, w, resolution=640):
+     min_metric = float('inf')
+     best_bucket = None
+     for (bucket_h, bucket_w) in bucket_options[resolution]:
+         metric = abs(h * bucket_w - w * bucket_h)
+         if metric <= min_metric:
+             min_metric = metric
+             best_bucket = (bucket_h, bucket_w)
+     return best_bucket
+
frame_pack/clip_vision.py ADDED
@@ -0,0 +1,14 @@
+ import numpy as np
+
+
+ def hf_clip_vision_encode(image, feature_extractor, image_encoder):
+     assert isinstance(image, np.ndarray)
+     assert image.ndim == 3 and image.shape[2] == 3
+     assert image.dtype == np.uint8
+
+     preprocessed = feature_extractor.preprocess(images=image, return_tensors="pt").to(
+         device=image_encoder.device, dtype=image_encoder.dtype
+     )
+     image_encoder_output = image_encoder(**preprocessed)
+
+     return image_encoder_output
frame_pack/framepack_utils.py ADDED
@@ -0,0 +1,273 @@
1
+ import os
2
+ import logging
3
+ from types import SimpleNamespace
4
+ from typing import Optional, Union
5
+
6
+ import accelerate
7
+ from accelerate import Accelerator, init_empty_weights
8
+ import torch
9
+ from safetensors.torch import load_file
10
+ from transformers import (
11
+ LlamaTokenizerFast,
12
+ LlamaConfig,
13
+ LlamaModel,
14
+ CLIPTokenizer,
15
+ CLIPTextModel,
16
+ CLIPConfig,
17
+ SiglipImageProcessor,
18
+ SiglipVisionModel,
19
+ SiglipVisionConfig,
20
+ )
21
+
22
+ from utils.safetensors_utils import load_split_weights
23
+ from hunyuan_model.vae import load_vae as hunyuan_load_vae
24
+
25
+ import logging
26
+
27
+ logger = logging.getLogger(__name__)
28
+ logging.basicConfig(level=logging.INFO)
29
+
30
+
31
+ def load_vae(
32
+ vae_path: str, vae_chunk_size: Optional[int], vae_spatial_tile_sample_min_size: Optional[int], device: Union[str, torch.device]
33
+ ):
34
+ # single file and directory (contains 'vae') support
35
+ if os.path.isdir(vae_path):
36
+ vae_path = os.path.join(vae_path, "vae", "diffusion_pytorch_model.safetensors")
37
+ else:
38
+ vae_path = vae_path
39
+
40
+ vae_dtype = torch.float16 # if vae_dtype is None else str_to_dtype(vae_dtype)
41
+ vae, _, s_ratio, t_ratio = hunyuan_load_vae(vae_dtype=vae_dtype, device=device, vae_path=vae_path)
42
+ vae.eval()
43
+ # vae_kwargs = {"s_ratio": s_ratio, "t_ratio": t_ratio}
44
+
45
+ # set chunk_size to CausalConv3d recursively
46
+ chunk_size = vae_chunk_size
47
+ if chunk_size is not None:
48
+ vae.set_chunk_size_for_causal_conv_3d(chunk_size)
49
+ logger.info(f"Set chunk_size to {chunk_size} for CausalConv3d")
50
+
51
+ if vae_spatial_tile_sample_min_size is not None:
52
+ vae.enable_spatial_tiling(True)
53
+ vae.tile_sample_min_size = vae_spatial_tile_sample_min_size
54
+ vae.tile_latent_min_size = vae_spatial_tile_sample_min_size // 8
55
+ logger.info(f"Enabled spatial tiling with min size {vae_spatial_tile_sample_min_size}")
56
+ # elif vae_tiling:
57
+ else:
58
+ vae.enable_spatial_tiling(True)
59
+
60
+ return vae
61
+
62
+
63
+ # region Text Encoders
64
+
65
+ # Text Encoder configs are copied from HunyuanVideo repo
66
+
67
+ LLAMA_CONFIG = {
68
+ "architectures": ["LlamaModel"],
69
+ "attention_bias": False,
70
+ "attention_dropout": 0.0,
71
+ "bos_token_id": 128000,
72
+ "eos_token_id": 128001,
73
+ "head_dim": 128,
74
+ "hidden_act": "silu",
75
+ "hidden_size": 4096,
76
+ "initializer_range": 0.02,
77
+ "intermediate_size": 14336,
78
+ "max_position_embeddings": 8192,
79
+ "mlp_bias": False,
80
+ "model_type": "llama",
81
+ "num_attention_heads": 32,
82
+ "num_hidden_layers": 32,
83
+ "num_key_value_heads": 8,
84
+ "pretraining_tp": 1,
85
+ "rms_norm_eps": 1e-05,
86
+ "rope_scaling": None,
87
+ "rope_theta": 500000.0,
88
+ "tie_word_embeddings": False,
89
+ "torch_dtype": "float16",
90
+ "transformers_version": "4.46.3",
91
+ "use_cache": True,
92
+ "vocab_size": 128320,
93
+ }
94
+
95
+ CLIP_CONFIG = {
96
+ # "_name_or_path": "/raid/aryan/llava-llama-3-8b-v1_1-extracted/text_encoder_2",
97
+ "architectures": ["CLIPTextModel"],
98
+ "attention_dropout": 0.0,
99
+ "bos_token_id": 0,
100
+ "dropout": 0.0,
101
+ "eos_token_id": 2,
102
+ "hidden_act": "quick_gelu",
103
+ "hidden_size": 768,
104
+ "initializer_factor": 1.0,
105
+ "initializer_range": 0.02,
106
+ "intermediate_size": 3072,
107
+ "layer_norm_eps": 1e-05,
108
+ "max_position_embeddings": 77,
109
+ "model_type": "clip_text_model",
110
+ "num_attention_heads": 12,
111
+ "num_hidden_layers": 12,
112
+ "pad_token_id": 1,
113
+ "projection_dim": 768,
114
+ "torch_dtype": "float16",
115
+ "transformers_version": "4.48.0.dev0",
116
+ "vocab_size": 49408,
117
+ }
118
+
119
+
120
+ def load_text_encoder1(
121
+ args, fp8_llm: Optional[bool] = False, device: Optional[Union[str, torch.device]] = None
122
+ ) -> tuple[LlamaTokenizerFast, LlamaModel]:
123
+ # single file, split file and directory (contains 'text_encoder') support
124
+ logger.info(f"Loading text encoder 1 tokenizer")
125
+ tokenizer1 = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="tokenizer")
126
+
127
+ logger.info(f"Loading text encoder 1 from {args.text_encoder1}")
128
+ if os.path.isdir(args.text_encoder1):
129
+ # load from directory, configs are in the directory
130
+ text_encoder1 = LlamaModel.from_pretrained(args.text_encoder1, subfolder="text_encoder", torch_dtype=torch.float16)
131
+ else:
132
+ # load from file, we create the model with the appropriate config
133
+ config = LlamaConfig(**LLAMA_CONFIG)
134
+ with init_empty_weights():
135
+ text_encoder1 = LlamaModel._from_config(config, torch_dtype=torch.float16)
136
+
137
+ state_dict = load_split_weights(args.text_encoder1)
138
+
139
+ # support weights from ComfyUI
140
+ if "model.embed_tokens.weight" in state_dict:
141
+ for key in list(state_dict.keys()):
142
+ if key.startswith("model."):
143
+ new_key = key.replace("model.", "")
144
+ state_dict[new_key] = state_dict[key]
145
+ del state_dict[key]
146
+ if "tokenizer" in state_dict:
147
+ state_dict.pop("tokenizer")
148
+ if "lm_head.weight" in state_dict:
149
+ state_dict.pop("lm_head.weight")
150
+
151
+ # # support weights from ComfyUI
152
+ # if "tokenizer" in state_dict:
153
+ # state_dict.pop("tokenizer")
154
+
155
+ text_encoder1.load_state_dict(state_dict, strict=True, assign=True)
156
+
157
+ if fp8_llm:
158
+ org_dtype = text_encoder1.dtype
159
+ logger.info(f"Moving and casting text encoder to {device} and torch.float8_e4m3fn")
160
+ text_encoder1.to(device=device, dtype=torch.float8_e4m3fn)
161
+
162
+ # prepare LLM for fp8
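+ # keep embeddings in the original dtype and run RMSNorm in float32; only the Linear weights remain in fp8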
163
+ def prepare_fp8(llama_model: LlamaModel, target_dtype):
164
+ def forward_hook(module):
165
+ def forward(hidden_states):
166
+ input_dtype = hidden_states.dtype
167
+ hidden_states = hidden_states.to(torch.float32)
168
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
169
+ hidden_states = hidden_states * torch.rsqrt(variance + module.variance_epsilon)
170
+ return module.weight.to(input_dtype) * hidden_states.to(input_dtype)
171
+
172
+ return forward
173
+
174
+ for module in llama_model.modules():
175
+ if module.__class__.__name__ in ["Embedding"]:
176
+ # print("set", module.__class__.__name__, "to", target_dtype)
177
+ module.to(target_dtype)
178
+ if module.__class__.__name__ in ["LlamaRMSNorm"]:
179
+ # print("set", module.__class__.__name__, "hooks")
180
+ module.forward = forward_hook(module)
181
+
182
+ prepare_fp8(text_encoder1, org_dtype)
183
+ else:
184
+ text_encoder1.to(device)
185
+
186
+ text_encoder1.eval()
187
+ return tokenizer1, text_encoder1
188
+
189
+
190
+ def load_text_encoder2(args) -> tuple[CLIPTokenizer, CLIPTextModel]:
191
+ # single file and directory (contains 'text_encoder_2') support
192
+ logger.info(f"Loading text encoder 2 tokenizer")
193
+ tokenizer2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="tokenizer_2")
194
+
195
+ logger.info(f"Loading text encoder 2 from {args.text_encoder2}")
196
+ if os.path.isdir(args.text_encoder2):
197
+ # load from directory, configs are in the directory
198
+ text_encoder2 = CLIPTextModel.from_pretrained(args.text_encoder2, subfolder="text_encoder_2", torch_dtype=torch.float16)
199
+ else:
200
+ # we only have one file, so we can load it directly
201
+ config = CLIPConfig(**CLIP_CONFIG)
202
+ with init_empty_weights():
203
+ text_encoder2 = CLIPTextModel._from_config(config, torch_dtype=torch.float16)
204
+
205
+ state_dict = load_file(args.text_encoder2)
206
+
207
+ text_encoder2.load_state_dict(state_dict, strict=True, assign=True)
208
+
209
+ text_encoder2.eval()
210
+ return tokenizer2, text_encoder2
211
+
212
+
213
+ # endregion
214
+
215
+ # region image encoder
216
+
217
+ # Siglip configs are copied from FramePack repo
218
+ FEATURE_EXTRACTOR_CONFIG = {
219
+ "do_convert_rgb": None,
220
+ "do_normalize": True,
221
+ "do_rescale": True,
222
+ "do_resize": True,
223
+ "image_mean": [0.5, 0.5, 0.5],
224
+ "image_processor_type": "SiglipImageProcessor",
225
+ "image_std": [0.5, 0.5, 0.5],
226
+ "processor_class": "SiglipProcessor",
227
+ "resample": 3,
228
+ "rescale_factor": 0.00392156862745098,
229
+ "size": {"height": 384, "width": 384},
230
+ }
231
+ IMAGE_ENCODER_CONFIG = {
232
+ "_name_or_path": "/home/lvmin/.cache/huggingface/hub/models--black-forest-labs--FLUX.1-Redux-dev/snapshots/1282f955f706b5240161278f2ef261d2a29ad649/image_encoder",
233
+ "architectures": ["SiglipVisionModel"],
234
+ "attention_dropout": 0.0,
235
+ "hidden_act": "gelu_pytorch_tanh",
236
+ "hidden_size": 1152,
237
+ "image_size": 384,
238
+ "intermediate_size": 4304,
239
+ "layer_norm_eps": 1e-06,
240
+ "model_type": "siglip_vision_model",
241
+ "num_attention_heads": 16,
242
+ "num_channels": 3,
243
+ "num_hidden_layers": 27,
244
+ "patch_size": 14,
245
+ "torch_dtype": "bfloat16",
246
+ "transformers_version": "4.46.2",
247
+ }
248
+
249
+
250
+ def load_image_encoders(args):
251
+ logger.info(f"Loading image encoder feature extractor")
252
+ feature_extractor = SiglipImageProcessor(**FEATURE_EXTRACTOR_CONFIG)
253
+
254
+ # single file, split file and directory (contains 'image_encoder') support
255
+ logger.info(f"Loading image encoder from {args.image_encoder}")
256
+ if os.path.isdir(args.image_encoder):
257
+ # load from directory, configs are in the directory
258
+ image_encoder = SiglipVisionModel.from_pretrained(args.image_encoder, subfolder="image_encoder", torch_dtype=torch.float16)
259
+ else:
260
+ # load from file, we create the model with the appropriate config
261
+ config = SiglipVisionConfig(**IMAGE_ENCODER_CONFIG)
262
+ with init_empty_weights():
263
+ image_encoder = SiglipVisionModel._from_config(config, torch_dtype=torch.float16)
264
+
265
+ state_dict = load_file(args.image_encoder)
266
+
267
+ image_encoder.load_state_dict(state_dict, strict=True, assign=True)
268
+
269
+ image_encoder.eval()
270
+ return feature_extractor, image_encoder
271
+
272
+
273
+ # endregion
frame_pack/hunyuan.py ADDED
@@ -0,0 +1,134 @@
1
+ # original code: https://github.com/lllyasviel/FramePack
2
+ # original license: Apache-2.0
3
+
4
+ import torch
5
+
6
+ # from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video import DEFAULT_PROMPT_TEMPLATE
7
+ # from diffusers_helper.utils import crop_or_pad_yield_mask
8
+ from hunyuan_model.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
9
+ from hunyuan_model.text_encoder import PROMPT_TEMPLATE
10
+
11
+
12
+ @torch.no_grad()
13
+ def encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2, max_length=256, custom_system_prompt=None):
14
+ assert isinstance(prompt, str)
15
+
16
+ prompt = [prompt]
17
+
18
+ # LLAMA
19
+
20
+ # We can verify crop_start by checking the token count of the prompt:
21
+ # custom_system_prompt = (
22
+ # "Describe the video by detailing the following aspects: "
23
+ # "1. The main content and theme of the video."
24
+ # "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
25
+ # "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
26
+ # "4. background environment, light, style and atmosphere."
27
+ # "5. camera angles, movements, and transitions used in the video:"
28
+ # )
29
+ if custom_system_prompt is None:
30
+ prompt_llama = [PROMPT_TEMPLATE["dit-llm-encode-video"]["template"].format(p) for p in prompt]
31
+ crop_start = PROMPT_TEMPLATE["dit-llm-encode-video"]["crop_start"]
32
+ else:
33
+ # count tokens for custom_system_prompt
34
+ full_prompt = f"<|start_header_id|>system<|end_header_id|>\n\n{custom_system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
35
+ print(f"Custom system prompt: {full_prompt}")
36
+ system_prompt_tokens = tokenizer(full_prompt, return_tensors="pt", truncation=True).input_ids[0].shape[0]
37
+ print(f"Custom system prompt token count: {system_prompt_tokens}")
38
+ prompt_llama = [full_prompt + p + "<|eot_id|>" for p in prompt]
39
+ crop_start = system_prompt_tokens
40
+
41
+ llama_inputs = tokenizer(
42
+ prompt_llama,
43
+ padding="max_length",
44
+ max_length=max_length + crop_start,
45
+ truncation=True,
46
+ return_tensors="pt",
47
+ return_length=False,
48
+ return_overflowing_tokens=False,
49
+ return_attention_mask=True,
50
+ )
51
+
52
+ llama_input_ids = llama_inputs.input_ids.to(text_encoder.device)
53
+ llama_attention_mask = llama_inputs.attention_mask.to(text_encoder.device)
54
+ llama_attention_length = int(llama_attention_mask.sum())
55
+
56
+ llama_outputs = text_encoder(
57
+ input_ids=llama_input_ids,
58
+ attention_mask=llama_attention_mask,
59
+ output_hidden_states=True,
60
+ )
61
+
62
+ llama_vec = llama_outputs.hidden_states[-3][:, crop_start:llama_attention_length]
63
+ # llama_vec_remaining = llama_outputs.hidden_states[-3][:, llama_attention_length:]
64
+ llama_attention_mask = llama_attention_mask[:, crop_start:llama_attention_length]
65
+
66
+ assert torch.all(llama_attention_mask.bool())
67
+
68
+ # CLIP
69
+
70
+ clip_l_input_ids = tokenizer_2(
71
+ prompt,
72
+ padding="max_length",
73
+ max_length=77,
74
+ truncation=True,
75
+ return_overflowing_tokens=False,
76
+ return_length=False,
77
+ return_tensors="pt",
78
+ ).input_ids
79
+ clip_l_pooler = text_encoder_2(clip_l_input_ids.to(text_encoder_2.device), output_hidden_states=False).pooler_output
80
+
81
+ return llama_vec, clip_l_pooler
82
+
83
+
84
+ @torch.no_grad()
85
+ def vae_decode_fake(latents):
86
+ latent_rgb_factors = [
87
+ [-0.0395, -0.0331, 0.0445],
88
+ [0.0696, 0.0795, 0.0518],
89
+ [0.0135, -0.0945, -0.0282],
90
+ [0.0108, -0.0250, -0.0765],
91
+ [-0.0209, 0.0032, 0.0224],
92
+ [-0.0804, -0.0254, -0.0639],
93
+ [-0.0991, 0.0271, -0.0669],
94
+ [-0.0646, -0.0422, -0.0400],
95
+ [-0.0696, -0.0595, -0.0894],
96
+ [-0.0799, -0.0208, -0.0375],
97
+ [0.1166, 0.1627, 0.0962],
98
+ [0.1165, 0.0432, 0.0407],
99
+ [-0.2315, -0.1920, -0.1355],
100
+ [-0.0270, 0.0401, -0.0821],
101
+ [-0.0616, -0.0997, -0.0727],
102
+ [0.0249, -0.0469, -0.1703],
103
+ ] # From comfyui
104
+
105
+ latent_rgb_factors_bias = [0.0259, -0.0192, -0.0761]
106
+
107
+ weight = torch.tensor(latent_rgb_factors, device=latents.device, dtype=latents.dtype).transpose(0, 1)[:, :, None, None, None]
108
+ bias = torch.tensor(latent_rgb_factors_bias, device=latents.device, dtype=latents.dtype)
109
+
110
+ images = torch.nn.functional.conv3d(latents, weight, bias=bias, stride=1, padding=0, dilation=1, groups=1)
111
+ images = images.clamp(0.0, 1.0)
112
+
113
+ return images
114
+
115
+
116
+ @torch.no_grad()
117
+ def vae_decode(latents, vae, image_mode=False) -> torch.Tensor:
118
+ latents = latents / vae.config.scaling_factor
119
+
120
+ if not image_mode:
121
+ image = vae.decode(latents.to(device=vae.device, dtype=vae.dtype)).sample
122
+ else:
123
+ latents = latents.to(device=vae.device, dtype=vae.dtype).unbind(2)
124
+ image = [vae.decode(l.unsqueeze(2)).sample for l in latents]
125
+ image = torch.cat(image, dim=2)
126
+
127
+ return image
128
+
129
+
130
+ @torch.no_grad()
131
+ def vae_encode(image, vae: AutoencoderKLCausal3D) -> torch.Tensor:
132
+ latents = vae.encode(image.to(device=vae.device, dtype=vae.dtype)).latent_dist.sample()
133
+ latents = latents * vae.config.scaling_factor
134
+ return latents
frame_pack/hunyuan_video_packed.py ADDED
@@ -0,0 +1,2038 @@
1
+ # original code: https://github.com/lllyasviel/FramePack
2
+ # original license: Apache-2.0
3
+
4
+ import glob
5
+ import math
6
+ import numbers
7
+ import os
8
+ from types import SimpleNamespace
9
+ from typing import Any, Dict, List, Optional, Tuple, Union
10
+
11
+ import torch
12
+ import einops
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+ import numpy as np
16
+
17
+ from modules.custom_offloading_utils import ModelOffloader
18
+ from utils.safetensors_utils import load_split_weights
19
+ from modules.fp8_optimization_utils import apply_fp8_monkey_patch, optimize_state_dict_with_fp8
20
+ from accelerate import init_empty_weights
21
+
22
+ try:
23
+ # raise NotImplementedError
24
+ from xformers.ops import memory_efficient_attention as xformers_attn_func
25
+
26
+ print("Xformers is installed!")
27
+ except:
28
+ print("Xformers is not installed!")
29
+ xformers_attn_func = None
30
+
31
+ try:
32
+ # raise NotImplementedError
33
+ from flash_attn import flash_attn_varlen_func, flash_attn_func
34
+
35
+ print("Flash Attn is installed!")
36
+ except:
37
+ print("Flash Attn is not installed!")
38
+ flash_attn_varlen_func = None
39
+ flash_attn_func = None
40
+
41
+ try:
42
+ # raise NotImplementedError
43
+ from sageattention import sageattn_varlen, sageattn
44
+
45
+ print("Sage Attn is installed!")
46
+ except:
47
+ print("Sage Attn is not installed!")
48
+ sageattn_varlen = None
49
+ sageattn = None
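+ # optional attention backends: a failed import leaves the corresponding function as None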
50
+
51
+
52
+ import logging
53
+
54
+ logger = logging.getLogger(__name__)
55
+ logging.basicConfig(level=logging.INFO)
56
+
57
+ # region diffusers
58
+
59
+ # copied from diffusers with some modifications to minimize dependencies
60
+ # original code: https://github.com/huggingface/diffusers/
61
+ # original license: Apache-2.0
62
+
63
+ ACT2CLS = {
64
+ "swish": nn.SiLU,
65
+ "silu": nn.SiLU,
66
+ "mish": nn.Mish,
67
+ "gelu": nn.GELU,
68
+ "relu": nn.ReLU,
69
+ }
70
+
71
+
72
+ def get_activation(act_fn: str) -> nn.Module:
73
+ """Helper function to get activation function from string.
74
+
75
+ Args:
76
+ act_fn (str): Name of activation function.
77
+
78
+ Returns:
79
+ nn.Module: Activation function.
80
+ """
81
+
82
+ act_fn = act_fn.lower()
83
+ if act_fn in ACT2CLS:
84
+ return ACT2CLS[act_fn]()
85
+ else:
86
+ raise ValueError(f"activation function {act_fn} not found in ACT2FN mapping {list(ACT2CLS.keys())}")
87
+
88
+
89
+ def get_timestep_embedding(
90
+ timesteps: torch.Tensor,
91
+ embedding_dim: int,
92
+ flip_sin_to_cos: bool = False,
93
+ downscale_freq_shift: float = 1,
94
+ scale: float = 1,
95
+ max_period: int = 10000,
96
+ ):
97
+ """
98
+ This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
99
+
100
+ Args
101
+ timesteps (torch.Tensor):
102
+ a 1-D Tensor of N indices, one per batch element. These may be fractional.
103
+ embedding_dim (int):
104
+ the dimension of the output.
105
+ flip_sin_to_cos (bool):
106
+ Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False)
107
+ downscale_freq_shift (float):
108
+ Controls the delta between frequencies between dimensions
109
+ scale (float):
110
+ Scaling factor applied to the embeddings.
111
+ max_period (int):
112
+ Controls the maximum frequency of the embeddings
113
+ Returns
114
+ torch.Tensor: an [N x dim] Tensor of positional embeddings.
115
+ """
116
+ assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
117
+
118
+ half_dim = embedding_dim // 2
119
+ exponent = -math.log(max_period) * torch.arange(start=0, end=half_dim, dtype=torch.float32, device=timesteps.device)
120
+ exponent = exponent / (half_dim - downscale_freq_shift)
121
+
122
+ emb = torch.exp(exponent)
123
+ emb = timesteps[:, None].float() * emb[None, :]
124
+
125
+ # scale embeddings
126
+ emb = scale * emb
127
+
128
+ # concat sine and cosine embeddings
129
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
130
+
131
+ # flip sine and cosine embeddings
132
+ if flip_sin_to_cos:
133
+ emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
134
+
135
+ # zero pad
136
+ if embedding_dim % 2 == 1:
137
+ emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
138
+ return emb
139
+
140
+
141
+ class TimestepEmbedding(nn.Module):
142
+ def __init__(
143
+ self,
144
+ in_channels: int,
145
+ time_embed_dim: int,
146
+ act_fn: str = "silu",
147
+ out_dim: int = None,
148
+ post_act_fn: Optional[str] = None,
149
+ cond_proj_dim=None,
150
+ sample_proj_bias=True,
151
+ ):
152
+ super().__init__()
153
+
154
+ self.linear_1 = nn.Linear(in_channels, time_embed_dim, sample_proj_bias)
155
+
156
+ if cond_proj_dim is not None:
157
+ self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
158
+ else:
159
+ self.cond_proj = None
160
+
161
+ self.act = get_activation(act_fn)
162
+
163
+ if out_dim is not None:
164
+ time_embed_dim_out = out_dim
165
+ else:
166
+ time_embed_dim_out = time_embed_dim
167
+ self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out, sample_proj_bias)
168
+
169
+ if post_act_fn is None:
170
+ self.post_act = None
171
+ else:
172
+ self.post_act = get_activation(post_act_fn)
173
+
174
+ def forward(self, sample, condition=None):
175
+ if condition is not None:
176
+ sample = sample + self.cond_proj(condition)
177
+ sample = self.linear_1(sample)
178
+
179
+ if self.act is not None:
180
+ sample = self.act(sample)
181
+
182
+ sample = self.linear_2(sample)
183
+
184
+ if self.post_act is not None:
185
+ sample = self.post_act(sample)
186
+ return sample
187
+
188
+
189
+ class Timesteps(nn.Module):
190
+ def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float, scale: int = 1):
191
+ super().__init__()
192
+ self.num_channels = num_channels
193
+ self.flip_sin_to_cos = flip_sin_to_cos
194
+ self.downscale_freq_shift = downscale_freq_shift
195
+ self.scale = scale
196
+
197
+ def forward(self, timesteps):
198
+ t_emb = get_timestep_embedding(
199
+ timesteps,
200
+ self.num_channels,
201
+ flip_sin_to_cos=self.flip_sin_to_cos,
202
+ downscale_freq_shift=self.downscale_freq_shift,
203
+ scale=self.scale,
204
+ )
205
+ return t_emb
206
+
207
+
208
+ class FP32SiLU(nn.Module):
209
+ r"""
210
+ SiLU activation function with input upcasted to torch.float32.
211
+ """
212
+
213
+ def __init__(self):
214
+ super().__init__()
215
+
216
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
217
+ return F.silu(inputs.float(), inplace=False).to(inputs.dtype)
218
+
219
+
220
+ class GELU(nn.Module):
221
+ r"""
222
+ GELU activation function with tanh approximation support with `approximate="tanh"`.
223
+
224
+ Parameters:
225
+ dim_in (`int`): The number of channels in the input.
226
+ dim_out (`int`): The number of channels in the output.
227
+ approximate (`str`, *optional*, defaults to `"none"`): If `"tanh"`, use tanh approximation.
228
+ bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
229
+ """
230
+
231
+ def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True):
232
+ super().__init__()
233
+ self.proj = nn.Linear(dim_in, dim_out, bias=bias)
234
+ self.approximate = approximate
235
+
236
+ def gelu(self, gate: torch.Tensor) -> torch.Tensor:
237
+ # if gate.device.type == "mps" and is_torch_version("<", "2.0.0"):
238
+ # # fp16 gelu not supported on mps before torch 2.0
239
+ # return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(dtype=gate.dtype)
240
+ return F.gelu(gate, approximate=self.approximate)
241
+
242
+ def forward(self, hidden_states):
243
+ hidden_states = self.proj(hidden_states)
244
+ hidden_states = self.gelu(hidden_states)
245
+ return hidden_states
246
+
247
+
248
+ class PixArtAlphaTextProjection(nn.Module):
249
+ """
250
+ Projects caption embeddings. Also handles dropout for classifier-free guidance.
251
+
252
+ Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
253
+ """
254
+
255
+ def __init__(self, in_features, hidden_size, out_features=None, act_fn="gelu_tanh"):
256
+ super().__init__()
257
+ if out_features is None:
258
+ out_features = hidden_size
259
+ self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias=True)
260
+ if act_fn == "gelu_tanh":
261
+ self.act_1 = nn.GELU(approximate="tanh")
262
+ elif act_fn == "silu":
263
+ self.act_1 = nn.SiLU()
264
+ elif act_fn == "silu_fp32":
265
+ self.act_1 = FP32SiLU()
266
+ else:
267
+ raise ValueError(f"Unknown activation function: {act_fn}")
268
+ self.linear_2 = nn.Linear(in_features=hidden_size, out_features=out_features, bias=True)
269
+
270
+ def forward(self, caption):
271
+ hidden_states = self.linear_1(caption)
272
+ hidden_states = self.act_1(hidden_states)
273
+ hidden_states = self.linear_2(hidden_states)
274
+ return hidden_states
275
+
276
+
277
+ class LayerNormFramePack(nn.LayerNorm):
278
+ # casting to dtype of input tensor is added
279
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
280
+ return torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps).to(x)
281
+
282
+
283
+ class FP32LayerNormFramePack(nn.LayerNorm):
284
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
285
+ origin_dtype = x.dtype
286
+ return torch.nn.functional.layer_norm(
287
+ x.float(),
288
+ self.normalized_shape,
289
+ self.weight.float() if self.weight is not None else None,
290
+ self.bias.float() if self.bias is not None else None,
291
+ self.eps,
292
+ ).to(origin_dtype)
293
+
294
+
295
+ class RMSNormFramePack(nn.Module):
296
+ r"""
297
+ RMS Norm as introduced in https://arxiv.org/abs/1910.07467 by Zhang et al.
298
+
299
+ Args:
300
+ dim (`int`): Number of dimensions to use for `weights`. Only effective when `elementwise_affine` is True.
301
+ eps (`float`): Small value to use when calculating the reciprocal of the square-root.
302
+ elementwise_affine (`bool`, defaults to `True`):
303
+ Boolean flag to denote if affine transformation should be applied.
304
+ bias (`bool`, defaults to False): If also training the `bias` param.
305
+ """
306
+
307
+ def __init__(self, dim, eps: float, elementwise_affine: bool = True, bias: bool = False):
308
+ super().__init__()
309
+
310
+ self.eps = eps
311
+ self.elementwise_affine = elementwise_affine
312
+
313
+ if isinstance(dim, numbers.Integral):
314
+ dim = (dim,)
315
+
316
+ self.dim = torch.Size(dim)
317
+
318
+ self.weight = None
319
+ self.bias = None
320
+
321
+ if elementwise_affine:
322
+ self.weight = nn.Parameter(torch.ones(dim))
323
+ if bias:
324
+ self.bias = nn.Parameter(torch.zeros(dim))
325
+
326
+ def forward(self, hidden_states):
327
+ input_dtype = hidden_states.dtype
328
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
329
+ hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
330
+
331
+ if self.weight is None:
332
+ return hidden_states.to(input_dtype)
333
+
334
+ return hidden_states.to(input_dtype) * self.weight.to(input_dtype)
335
+
336
+
337
+ class AdaLayerNormContinuousFramePack(nn.Module):
338
+ r"""
339
+ Adaptive normalization layer with a norm layer (layer_norm or rms_norm).
340
+
341
+ Args:
342
+ embedding_dim (`int`): Embedding dimension to use during projection.
343
+ conditioning_embedding_dim (`int`): Dimension of the input condition.
344
+ elementwise_affine (`bool`, defaults to `True`):
345
+ Boolean flag to denote if affine transformation should be applied.
346
+ eps (`float`, defaults to 1e-5): Epsilon factor.
347
+ bias (`bias`, defaults to `True`): Boolean flag to denote if bias should be use.
348
+ norm_type (`str`, defaults to `"layer_norm"`):
349
+ Normalization layer to use. Values supported: "layer_norm", "rms_norm".
350
+ """
351
+
352
+ def __init__(
353
+ self,
354
+ embedding_dim: int,
355
+ conditioning_embedding_dim: int,
356
+ # NOTE: It is a bit weird that the norm layer can be configured to have scale and shift parameters
357
+ # because the output is immediately scaled and shifted by the projected conditioning embeddings.
358
+ # Note that AdaLayerNorm does not let the norm layer have scale and shift parameters.
359
+ # However, this is how it was implemented in the original code, and it's rather likely you should
360
+ # set `elementwise_affine` to False.
361
+ elementwise_affine=True,
362
+ eps=1e-5,
363
+ bias=True,
364
+ norm_type="layer_norm",
365
+ ):
366
+ super().__init__()
367
+ self.silu = nn.SiLU()
368
+ self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
369
+ if norm_type == "layer_norm":
370
+ self.norm = LayerNormFramePack(embedding_dim, eps, elementwise_affine, bias)
371
+ elif norm_type == "rms_norm":
372
+ self.norm = RMSNormFramePack(embedding_dim, eps, elementwise_affine)
373
+ else:
374
+ raise ValueError(f"unknown norm_type {norm_type}")
375
+
376
+ def forward(self, x, conditioning_embedding):
377
+ emb = self.linear(self.silu(conditioning_embedding))
378
+ scale, shift = emb.chunk(2, dim=1)
379
+ x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
380
+ return x
381
+
382
+
383
+ class LinearActivation(nn.Module):
384
+ def __init__(self, dim_in: int, dim_out: int, bias: bool = True, activation: str = "silu"):
385
+ super().__init__()
386
+
387
+ self.proj = nn.Linear(dim_in, dim_out, bias=bias)
388
+ self.activation = get_activation(activation)
389
+
390
+ def forward(self, hidden_states):
391
+ hidden_states = self.proj(hidden_states)
392
+ return self.activation(hidden_states)
393
+
394
+
395
+ class FeedForward(nn.Module):
396
+ r"""
397
+ A feed-forward layer.
398
+
399
+ Parameters:
400
+ dim (`int`): The number of channels in the input.
401
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
402
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
403
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
404
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
405
+ final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
406
+ bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
407
+ """
408
+
409
+ def __init__(
410
+ self,
411
+ dim: int,
412
+ dim_out: Optional[int] = None,
413
+ mult: int = 4,
414
+ dropout: float = 0.0,
415
+ activation_fn: str = "geglu",
416
+ final_dropout: bool = False,
417
+ inner_dim=None,
418
+ bias: bool = True,
419
+ ):
420
+ super().__init__()
421
+ if inner_dim is None:
422
+ inner_dim = int(dim * mult)
423
+ dim_out = dim_out if dim_out is not None else dim
424
+
425
+ # if activation_fn == "gelu":
426
+ # act_fn = GELU(dim, inner_dim, bias=bias)
427
+ if activation_fn == "gelu-approximate":
428
+ act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
429
+ # elif activation_fn == "geglu":
430
+ # act_fn = GEGLU(dim, inner_dim, bias=bias)
431
+ # elif activation_fn == "geglu-approximate":
432
+ # act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
433
+ # elif activation_fn == "swiglu":
434
+ # act_fn = SwiGLU(dim, inner_dim, bias=bias)
435
+ elif activation_fn == "linear-silu":
436
+ act_fn = LinearActivation(dim, inner_dim, bias=bias, activation="silu")
437
+ else:
438
+ raise ValueError(f"Unknown activation function: {activation_fn}")
439
+
440
+ self.net = nn.ModuleList([])
441
+ # project in
442
+ self.net.append(act_fn)
443
+ # project dropout
444
+ self.net.append(nn.Dropout(dropout))
445
+ # project out
446
+ self.net.append(nn.Linear(inner_dim, dim_out, bias=bias))
447
+ # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
448
+ if final_dropout:
449
+ self.net.append(nn.Dropout(dropout))
450
+
451
+ def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
452
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
453
+ # deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
454
+ # deprecate("scale", "1.0.0", deprecation_message)
455
+ raise ValueError("scale is not supported in this version. Please remove it.")
456
+ for module in self.net:
457
+ hidden_states = module(hidden_states)
458
+ return hidden_states
459
+
460
+
461
+ # @maybe_allow_in_graph
462
+ class Attention(nn.Module):
463
+ r"""
464
+ Minimal copy of Attention class from diffusers.
465
+ """
466
+
467
+ def __init__(
468
+ self,
469
+ query_dim: int,
470
+ cross_attention_dim: Optional[int] = None,
471
+ heads: int = 8,
472
+ dim_head: int = 64,
473
+ bias: bool = False,
474
+ qk_norm: Optional[str] = None,
475
+ added_kv_proj_dim: Optional[int] = None,
476
+ eps: float = 1e-5,
477
+ processor: Optional[any] = None,
478
+ out_dim: Optional[int] = None,
479
+ context_pre_only=None,
480
+ pre_only=False,
481
+ ):
482
+ super().__init__()
483
+ self.inner_dim = out_dim if out_dim is not None else dim_head * heads
484
+ self.inner_kv_dim = self.inner_dim # if kv_heads is None else dim_head * kv_heads
485
+ self.query_dim = query_dim
486
+ self.use_bias = bias
487
+ self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
488
+ self.out_dim = out_dim if out_dim is not None else query_dim
489
+ self.out_context_dim = query_dim
490
+ self.context_pre_only = context_pre_only
491
+ self.pre_only = pre_only
492
+
493
+ self.scale = dim_head**-0.5
494
+ self.heads = out_dim // dim_head if out_dim is not None else heads
495
+
496
+ self.added_kv_proj_dim = added_kv_proj_dim
497
+
498
+ if qk_norm is None:
499
+ self.norm_q = None
500
+ self.norm_k = None
501
+ elif qk_norm == "rms_norm":
502
+ self.norm_q = RMSNormFramePack(dim_head, eps=eps)
503
+ self.norm_k = RMSNormFramePack(dim_head, eps=eps)
504
+ else:
505
+ raise ValueError(
506
+ f"unknown qk_norm: {qk_norm}. Should be one of None, 'layer_norm', 'fp32_layer_norm', 'layer_norm_across_heads', 'rms_norm', 'rms_norm_across_heads', 'l2'."
507
+ )
508
+
509
+ self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias)
510
+ self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
511
+ self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
512
+
513
+ self.added_proj_bias = True # added_proj_bias
514
+ if self.added_kv_proj_dim is not None:
515
+ self.add_k_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=True)
516
+ self.add_v_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=True)
517
+ if self.context_pre_only is not None:
518
+ self.add_q_proj = nn.Linear(added_kv_proj_dim, self.inner_dim, bias=True)
519
+ else:
520
+ self.add_q_proj = None
521
+ self.add_k_proj = None
522
+ self.add_v_proj = None
523
+
524
+ if not self.pre_only:
525
+ self.to_out = nn.ModuleList([])
526
+ self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=True))
527
+ # self.to_out.append(nn.Dropout(dropout))
528
+ self.to_out.append(nn.Identity()) # dropout=0.0
529
+ else:
530
+ self.to_out = None
531
+
532
+ if self.context_pre_only is not None and not self.context_pre_only:
533
+ self.to_add_out = nn.Linear(self.inner_dim, self.out_context_dim, bias=True)
534
+ else:
535
+ self.to_add_out = None
536
+
537
+ if qk_norm is not None and added_kv_proj_dim is not None:
538
+ if qk_norm == "rms_norm":
539
+ self.norm_added_q = RMSNormFramePack(dim_head, eps=eps)
540
+ self.norm_added_k = RMSNormFramePack(dim_head, eps=eps)
541
+ else:
542
+ raise ValueError(f"unknown qk_norm: {qk_norm}. Should be one of `None,'layer_norm','fp32_layer_norm','rms_norm'`")
543
+ else:
544
+ self.norm_added_q = None
545
+ self.norm_added_k = None
546
+
547
+ # set attention processor
548
+ # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
549
+ # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
550
+ if processor is None:
551
+ processor = AttnProcessor2_0()
552
+ self.set_processor(processor)
553
+
554
+ def set_processor(self, processor: any) -> None:
555
+ self.processor = processor
556
+
557
+ def get_processor(self) -> any:
558
+ return self.processor
559
+
560
+ def forward(
561
+ self,
562
+ hidden_states: torch.Tensor,
563
+ encoder_hidden_states: Optional[torch.Tensor] = None,
564
+ attention_mask: Optional[torch.Tensor] = None,
565
+ **cross_attention_kwargs,
566
+ ) -> torch.Tensor:
567
+ return self.processor(
568
+ self,
569
+ hidden_states,
570
+ encoder_hidden_states=encoder_hidden_states,
571
+ attention_mask=attention_mask,
572
+ **cross_attention_kwargs,
573
+ )
574
+
575
+ def prepare_attention_mask(
576
+ self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3
577
+ ) -> torch.Tensor:
578
+ r"""
579
+ Prepare the attention mask for the attention computation.
580
+
581
+ Args:
582
+ attention_mask (`torch.Tensor`):
583
+ The attention mask to prepare.
584
+ target_length (`int`):
585
+ The target length of the attention mask. This is the length of the attention mask after padding.
586
+ batch_size (`int`):
587
+ The batch size, which is used to repeat the attention mask.
588
+ out_dim (`int`, *optional*, defaults to `3`):
589
+ The output dimension of the attention mask. Can be either `3` or `4`.
590
+
591
+ Returns:
592
+ `torch.Tensor`: The prepared attention mask.
593
+ """
594
+ head_size = self.heads
595
+ if attention_mask is None:
596
+ return attention_mask
597
+
598
+ current_length: int = attention_mask.shape[-1]
599
+ if current_length != target_length:
600
+ if attention_mask.device.type == "mps":
601
+ # HACK: MPS: Does not support padding by greater than dimension of input tensor.
602
+ # Instead, we can manually construct the padding tensor.
603
+ padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
604
+ padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
605
+ attention_mask = torch.cat([attention_mask, padding], dim=2)
606
+ else:
607
+ # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
608
+ # we want to instead pad by (0, remaining_length), where remaining_length is:
609
+ # remaining_length: int = target_length - current_length
610
+ # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
611
+ attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
612
+
613
+ if out_dim == 3:
614
+ if attention_mask.shape[0] < batch_size * head_size:
615
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=0, output_size=attention_mask.shape[0] * head_size)
616
+ elif out_dim == 4:
617
+ attention_mask = attention_mask.unsqueeze(1)
618
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=1, output_size=attention_mask.shape[1] * head_size)
619
+
620
+ return attention_mask
621
+
622
+
623
+ class AttnProcessor2_0:
624
+ r"""
625
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
626
+ """
627
+
628
+ def __init__(self):
629
+ if not hasattr(F, "scaled_dot_product_attention"):
630
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
631
+
632
+ def __call__(
633
+ self,
634
+ attn: Attention,
635
+ hidden_states: torch.Tensor,
636
+ encoder_hidden_states: Optional[torch.Tensor] = None,
637
+ attention_mask: Optional[torch.Tensor] = None,
638
+ temb: Optional[torch.Tensor] = None,
639
+ *args,
640
+ **kwargs,
641
+ ) -> torch.Tensor:
642
+ input_ndim = hidden_states.ndim
643
+
644
+ if input_ndim == 4:
645
+ batch_size, channel, height, width = hidden_states.shape
646
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
647
+
648
+ batch_size, sequence_length, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
649
+
650
+ if attention_mask is not None:
651
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
652
+ # scaled_dot_product_attention expects attention_mask shape to be
653
+ # (batch, heads, source_length, target_length)
654
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
655
+
656
+ query = attn.to_q(hidden_states)
657
+ query_dtype = query.dtype # store dtype before potentially deleting query
658
+
659
+ if encoder_hidden_states is None:
660
+ encoder_hidden_states = hidden_states
661
+
662
+ key = attn.to_k(encoder_hidden_states)
663
+ value = attn.to_v(encoder_hidden_states)
664
+
665
+ inner_dim = key.shape[-1]
666
+ head_dim = inner_dim // attn.heads
667
+
668
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
669
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
670
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
671
+
672
+ if attn.norm_q is not None:
673
+ query = attn.norm_q(query)
674
+ if attn.norm_k is not None:
675
+ key = attn.norm_k(key)
676
+
677
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
678
+ hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False)
679
+ del query, key, value, attention_mask # free memory
680
+
681
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
682
+ hidden_states = hidden_states.to(query_dtype) # use stored dtype
683
+
684
+ # linear proj
685
+ hidden_states = attn.to_out[0](hidden_states)
686
+ # dropout
687
+ hidden_states = attn.to_out[1](hidden_states)
688
+
689
+ if input_ndim == 4:
690
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
691
+
692
+ return hidden_states
693
+
694
+
695
+ # endregion diffusers
696
+
697
+
698
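+ # Pads t/h/w up to the next multiple of the 3D conv kernel size using replicate padding
+ # (F.pad takes pad widths from the last dimension backwards: w, then h, then t).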
+ def pad_for_3d_conv(x, kernel_size):
699
+ b, c, t, h, w = x.shape
700
+ pt, ph, pw = kernel_size
701
+ pad_t = (pt - (t % pt)) % pt
702
+ pad_h = (ph - (h % ph)) % ph
703
+ pad_w = (pw - (w % pw)) % pw
704
+ return torch.nn.functional.pad(x, (0, pad_w, 0, pad_h, 0, pad_t), mode="replicate")
705
+
706
+
707
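+ # Downsamples by the kernel window; the original centre-pixel selection (left commented out below)
+ # was replaced by average pooling.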
+ def center_down_sample_3d(x, kernel_size):
708
+ # pt, ph, pw = kernel_size
709
+ # cp = (pt * ph * pw) // 2
710
+ # xp = einops.rearrange(x, 'b c (t pt) (h ph) (w pw) -> (pt ph pw) b c t h w', pt=pt, ph=ph, pw=pw)
711
+ # xc = xp[cp]
712
+ # return xc
713
+ return torch.nn.functional.avg_pool3d(x, kernel_size, stride=kernel_size)
714
+
715
+
716
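+ # Builds cumulative sequence boundaries for the varlen attention kernels over the flattened
+ # (batch x (image + padded text)) token stream: for each sample, one boundary at the end of its
+ # valid image+text tokens and one at the end of its padded row.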
+ def get_cu_seqlens(text_mask, img_len):
717
+ batch_size = text_mask.shape[0]
718
+ text_len = text_mask.sum(dim=1)
719
+ max_len = text_mask.shape[1] + img_len
720
+
721
+ cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device=text_mask.device) # ensure device match
722
+
723
+ for i in range(batch_size):
724
+ s = text_len[i] + img_len
725
+ s1 = i * max_len + s
726
+ s2 = (i + 1) * max_len
727
+ cu_seqlens[2 * i + 1] = s1
728
+ cu_seqlens[2 * i + 2] = s2
729
+
730
+ return cu_seqlens
731
+
732
+
733
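+ # Rotary embedding for (B, L, heads, dim)-layout tensors: freqs_cis stores the cos and sin halves
+ # concatenated along the last dimension, and adjacent channel pairs of x are rotated.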
+ def apply_rotary_emb_transposed(x, freqs_cis):
734
+ cos, sin = freqs_cis.unsqueeze(-2).chunk(2, dim=-1)
735
+ del freqs_cis
736
+ x_real, x_imag = x.unflatten(-1, (-1, 2)).unbind(-1)
737
+ x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
738
+ del x_real, x_imag
739
+ return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
740
+
741
+
742
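+ # Attention dispatcher: with no cu_seqlens it runs plain batched attention (SageAttention, FlashAttention,
+ # xformers or PyTorch SDPA, in that order of preference); with split_attn it runs the chosen backend one
+ # sample at a time; otherwise it falls back to the varlen kernels, which require SageAttention or FlashAttention.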
+ def attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, attn_mode=None, split_attn=False):
743
+ if cu_seqlens_q is None and cu_seqlens_kv is None and max_seqlen_q is None and max_seqlen_kv is None:
744
+ if attn_mode == "sageattn" or attn_mode is None and sageattn is not None:
745
+ x = sageattn(q, k, v, tensor_layout="NHD")
746
+ return x
747
+
748
+ if attn_mode == "flash" or attn_mode is None and flash_attn_func is not None:
749
+ x = flash_attn_func(q, k, v)
750
+ return x
751
+
752
+ if attn_mode == "xformers" or attn_mode is None and xformers_attn_func is not None:
753
+ x = xformers_attn_func(q, k, v)
754
+ return x
755
+
756
+ x = torch.nn.functional.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)).transpose(
757
+ 1, 2
758
+ )
759
+ return x
760
+ if split_attn:
761
+ if attn_mode == "sageattn" or attn_mode is None and sageattn is not None:
762
+ x = torch.empty_like(q)
763
+ for i in range(q.size(0)):
764
+ x[i : i + 1] = sageattn(q[i : i + 1], k[i : i + 1], v[i : i + 1], tensor_layout="NHD")
765
+ return x
766
+
767
+ if attn_mode == "flash" or attn_mode is None and flash_attn_func is not None:
768
+ x = torch.empty_like(q)
769
+ for i in range(q.size(0)):
770
+ x[i : i + 1] = flash_attn_func(q[i : i + 1], k[i : i + 1], v[i : i + 1])
771
+ return x
772
+
773
+ if attn_mode == "xformers" or attn_mode is None and xformers_attn_func is not None:
774
+ x = torch.empty_like(q)
775
+ for i in range(q.size(0)):
776
+ x[i : i + 1] = xformers_attn_func(q[i : i + 1], k[i : i + 1], v[i : i + 1])
777
+ return x
778
+
779
+ q = q.transpose(1, 2)
780
+ k = k.transpose(1, 2)
781
+ v = v.transpose(1, 2)
782
+ x = torch.empty_like(q)
783
+ for i in range(q.size(0)):
784
+ x[i : i + 1] = torch.nn.functional.scaled_dot_product_attention(q[i : i + 1], k[i : i + 1], v[i : i + 1])
785
+ x = x.transpose(1, 2)
786
+ return x
787
+
788
+ batch_size = q.shape[0]
789
+ q = q.view(q.shape[0] * q.shape[1], *q.shape[2:])
790
+ k = k.view(k.shape[0] * k.shape[1], *k.shape[2:])
791
+ v = v.view(v.shape[0] * v.shape[1], *v.shape[2:])
792
+ if attn_mode == "sageattn" or attn_mode is None and sageattn_varlen is not None:
793
+ x = sageattn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
794
+ del q, k, v # free memory
795
+ elif attn_mode == "flash" or attn_mode is None and flash_attn_varlen_func is not None:
796
+ x = flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
797
+ del q, k, v # free memory
798
+ else:
799
+ raise NotImplementedError("No varlen attention backend (SageAttention/FlashAttention) is installed, so batch_size > 1 is not supported in this configuration. Try `--split_attn`.")
800
+ x = x.view(batch_size, max_seqlen_q, *x.shape[2:])
801
+ return x
802
+
803
+
804
+ class HunyuanAttnProcessorFlashAttnDouble:
805
+ def __call__(
806
+ self,
807
+ attn: Attention,
808
+ hidden_states,
809
+ encoder_hidden_states,
810
+ attention_mask,
811
+ image_rotary_emb,
812
+ attn_mode: Optional[str] = None,
813
+ split_attn: Optional[bool] = False,
814
+ ):
815
+ cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv = attention_mask
816
+
817
+ # Project image latents
818
+ query = attn.to_q(hidden_states)
819
+ key = attn.to_k(hidden_states)
820
+ value = attn.to_v(hidden_states)
821
+ del hidden_states # free memory
822
+
823
+ query = query.unflatten(2, (attn.heads, -1))
824
+ key = key.unflatten(2, (attn.heads, -1))
825
+ value = value.unflatten(2, (attn.heads, -1))
826
+
827
+ query = attn.norm_q(query)
828
+ key = attn.norm_k(key)
829
+
830
+ query = apply_rotary_emb_transposed(query, image_rotary_emb)
831
+ key = apply_rotary_emb_transposed(key, image_rotary_emb)
832
+ del image_rotary_emb # free memory
833
+
834
+ # Project context (text/encoder) embeddings
835
+ encoder_query = attn.add_q_proj(encoder_hidden_states)
836
+ encoder_key = attn.add_k_proj(encoder_hidden_states)
837
+ encoder_value = attn.add_v_proj(encoder_hidden_states)
838
+ txt_length = encoder_hidden_states.shape[1] # store length before deleting
839
+ del encoder_hidden_states # free memory
840
+
841
+ encoder_query = encoder_query.unflatten(2, (attn.heads, -1))
842
+ encoder_key = encoder_key.unflatten(2, (attn.heads, -1))
843
+ encoder_value = encoder_value.unflatten(2, (attn.heads, -1))
844
+
845
+ encoder_query = attn.norm_added_q(encoder_query)
846
+ encoder_key = attn.norm_added_k(encoder_key)
847
+
848
+ # Concatenate image and context q, k, v
849
+ query = torch.cat([query, encoder_query], dim=1)
850
+ key = torch.cat([key, encoder_key], dim=1)
851
+ value = torch.cat([value, encoder_value], dim=1)
852
+ del encoder_query, encoder_key, encoder_value # free memory
853
+
854
+ hidden_states_attn = attn_varlen_func(
855
+ query, key, value, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, attn_mode=attn_mode, split_attn=split_attn
856
+ )
857
+ del query, key, value # free memory
858
+ hidden_states_attn = hidden_states_attn.flatten(-2)
859
+
860
+ hidden_states, encoder_hidden_states = hidden_states_attn[:, :-txt_length], hidden_states_attn[:, -txt_length:]
861
+ del hidden_states_attn # free memory
862
+
863
+ # Apply output projections
864
+ hidden_states = attn.to_out[0](hidden_states)
865
+ hidden_states = attn.to_out[1](hidden_states) # Dropout/Identity
866
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
867
+
868
+ return hidden_states, encoder_hidden_states
869
+
870
+
871
+ class HunyuanAttnProcessorFlashAttnSingle:
872
+ def __call__(
873
+ self,
874
+ attn: Attention,
875
+ hidden_states,
876
+ encoder_hidden_states,
877
+ attention_mask,
878
+ image_rotary_emb,
879
+ attn_mode: Optional[str] = None,
880
+ split_attn: Optional[bool] = False,
881
+ ):
882
+ cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv = attention_mask
883
+ txt_length = encoder_hidden_states.shape[1] # Store text length
884
+
885
+ # Concatenate image and context inputs
886
+ hidden_states_cat = torch.cat([hidden_states, encoder_hidden_states], dim=1)
887
+ del hidden_states, encoder_hidden_states # free memory
888
+
889
+ # Project concatenated inputs
890
+ query = attn.to_q(hidden_states_cat)
891
+ key = attn.to_k(hidden_states_cat)
892
+ value = attn.to_v(hidden_states_cat)
893
+ del hidden_states_cat # free memory
894
+
895
+ query = query.unflatten(2, (attn.heads, -1))
896
+ key = key.unflatten(2, (attn.heads, -1))
897
+ value = value.unflatten(2, (attn.heads, -1))
898
+
899
+ query = attn.norm_q(query)
900
+ key = attn.norm_k(key)
901
+
902
+ query = torch.cat([apply_rotary_emb_transposed(query[:, :-txt_length], image_rotary_emb), query[:, -txt_length:]], dim=1)
903
+ key = torch.cat([apply_rotary_emb_transposed(key[:, :-txt_length], image_rotary_emb), key[:, -txt_length:]], dim=1)
904
+ del image_rotary_emb # free memory
905
+
906
+ hidden_states = attn_varlen_func(
907
+ query, key, value, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, attn_mode=attn_mode, split_attn=split_attn
908
+ )
909
+ del query, key, value # free memory
910
+ hidden_states = hidden_states.flatten(-2)
911
+
912
+ hidden_states, encoder_hidden_states = hidden_states[:, :-txt_length], hidden_states[:, -txt_length:]
913
+
914
+ return hidden_states, encoder_hidden_states
915
+
916
+
917
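+ # Sums the sinusoidal timestep embedding, the guidance embedding and the projected pooled text
+ # embedding into a single conditioning vector.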
+ class CombinedTimestepGuidanceTextProjEmbeddings(nn.Module):
918
+ def __init__(self, embedding_dim, pooled_projection_dim):
919
+ super().__init__()
920
+
921
+ self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
922
+ self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
923
+ self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
924
+ self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
925
+
926
+ def forward(self, timestep, guidance, pooled_projection):
927
+ timesteps_proj = self.time_proj(timestep)
928
+ timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype))
929
+
930
+ guidance_proj = self.time_proj(guidance)
931
+ guidance_emb = self.guidance_embedder(guidance_proj.to(dtype=pooled_projection.dtype))
932
+
933
+ time_guidance_emb = timesteps_emb + guidance_emb
934
+
935
+ pooled_projections = self.text_embedder(pooled_projection)
936
+ conditioning = time_guidance_emb + pooled_projections
937
+
938
+ return conditioning
939
+
940
+
941
+ class CombinedTimestepTextProjEmbeddings(nn.Module):
942
+ def __init__(self, embedding_dim, pooled_projection_dim):
943
+ super().__init__()
944
+
945
+ self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
946
+ self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
947
+ self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
948
+
949
+ def forward(self, timestep, pooled_projection):
950
+ timesteps_proj = self.time_proj(timestep)
951
+ timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype))
952
+
953
+ pooled_projections = self.text_embedder(pooled_projection)
954
+
955
+ conditioning = timesteps_emb + pooled_projections
956
+
957
+ return conditioning
958
+
959
+
960
+ class HunyuanVideoAdaNorm(nn.Module):
961
+ def __init__(self, in_features: int, out_features: Optional[int] = None) -> None:
962
+ super().__init__()
963
+
964
+ out_features = out_features or 2 * in_features
965
+ self.linear = nn.Linear(in_features, out_features)
966
+ self.nonlinearity = nn.SiLU()
967
+
968
+ def forward(self, temb: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
969
+ temb = self.linear(self.nonlinearity(temb))
970
+ gate_msa, gate_mlp = temb.chunk(2, dim=-1)
971
+ gate_msa, gate_mlp = gate_msa.unsqueeze(1), gate_mlp.unsqueeze(1)
972
+ return gate_msa, gate_mlp
973
+
974
+
975
+ class HunyuanVideoIndividualTokenRefinerBlock(nn.Module):
976
+ def __init__(
977
+ self,
978
+ num_attention_heads: int,
979
+ attention_head_dim: int,
980
+ mlp_width_ratio: float = 4.0,
981
+ mlp_drop_rate: float = 0.0,
982
+ attention_bias: bool = True,
983
+ ) -> None:
984
+ super().__init__()
985
+
986
+ hidden_size = num_attention_heads * attention_head_dim
987
+
988
+ self.norm1 = LayerNormFramePack(hidden_size, elementwise_affine=True, eps=1e-6)
989
+ self.attn = Attention(
990
+ query_dim=hidden_size,
991
+ cross_attention_dim=None,
992
+ heads=num_attention_heads,
993
+ dim_head=attention_head_dim,
994
+ bias=attention_bias,
995
+ )
996
+
997
+ self.norm2 = LayerNormFramePack(hidden_size, elementwise_affine=True, eps=1e-6)
998
+ self.ff = FeedForward(hidden_size, mult=mlp_width_ratio, activation_fn="linear-silu", dropout=mlp_drop_rate)
999
+
1000
+ self.norm_out = HunyuanVideoAdaNorm(hidden_size, 2 * hidden_size)
1001
+
1002
+ def forward(
1003
+ self,
1004
+ hidden_states: torch.Tensor,
1005
+ temb: torch.Tensor,
1006
+ attention_mask: Optional[torch.Tensor] = None,
1007
+ ) -> torch.Tensor:
1008
+ norm_hidden_states = self.norm1(hidden_states)
1009
+
1010
+ # Self-attention
1011
+ attn_output = self.attn(
1012
+ hidden_states=norm_hidden_states,
1013
+ encoder_hidden_states=None,
1014
+ attention_mask=attention_mask,
1015
+ )
1016
+ del norm_hidden_states # free memory
1017
+
1018
+ gate_msa, gate_mlp = self.norm_out(temb)
1019
+ hidden_states = hidden_states + attn_output * gate_msa
1020
+ del attn_output, gate_msa # free memory
1021
+
1022
+ ff_output = self.ff(self.norm2(hidden_states))
1023
+ hidden_states = hidden_states + ff_output * gate_mlp
1024
+ del ff_output, gate_mlp # free memory
1025
+
1026
+ return hidden_states
1027
+
1028
+
1029
+ class HunyuanVideoIndividualTokenRefiner(nn.Module):
1030
+ def __init__(
1031
+ self,
1032
+ num_attention_heads: int,
1033
+ attention_head_dim: int,
1034
+ num_layers: int,
1035
+ mlp_width_ratio: float = 4.0,
1036
+ mlp_drop_rate: float = 0.0,
1037
+ attention_bias: bool = True,
1038
+ ) -> None:
1039
+ super().__init__()
1040
+
1041
+ self.refiner_blocks = nn.ModuleList(
1042
+ [
1043
+ HunyuanVideoIndividualTokenRefinerBlock(
1044
+ num_attention_heads=num_attention_heads,
1045
+ attention_head_dim=attention_head_dim,
1046
+ mlp_width_ratio=mlp_width_ratio,
1047
+ mlp_drop_rate=mlp_drop_rate,
1048
+ attention_bias=attention_bias,
1049
+ )
1050
+ for _ in range(num_layers)
1051
+ ]
1052
+ )
1053
+
1054
+ def forward(
1055
+ self,
1056
+ hidden_states: torch.Tensor,
1057
+ temb: torch.Tensor,
1058
+ attention_mask: Optional[torch.Tensor] = None,
1059
+ ) -> torch.Tensor:
1060
+ self_attn_mask = None
1061
+ if attention_mask is not None:
1062
+ batch_size = attention_mask.shape[0]
1063
+ seq_len = attention_mask.shape[1]
1064
+ attention_mask = attention_mask.to(hidden_states.device).bool()
1065
+ self_attn_mask_1 = attention_mask.view(batch_size, 1, 1, seq_len).repeat(1, 1, seq_len, 1)
1066
+ self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
1067
+ self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
1068
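+ # Ensure every query can attend to at least the first token, so rows that would otherwise be
+ # fully masked do not produce NaNs in the softmax.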
+ self_attn_mask[:, :, :, 0] = True
1069
+
1070
+ for block in self.refiner_blocks:
1071
+ hidden_states = block(hidden_states, temb, self_attn_mask)
1072
+
1073
+ return hidden_states
1074
+
1075
+
1076
+ class HunyuanVideoTokenRefiner(nn.Module):
1077
+ def __init__(
1078
+ self,
1079
+ in_channels: int,
1080
+ num_attention_heads: int,
1081
+ attention_head_dim: int,
1082
+ num_layers: int,
1083
+ mlp_ratio: float = 4.0,
1084
+ mlp_drop_rate: float = 0.0,
1085
+ attention_bias: bool = True,
1086
+ ) -> None:
1087
+ super().__init__()
1088
+
1089
+ hidden_size = num_attention_heads * attention_head_dim
1090
+
1091
+ self.time_text_embed = CombinedTimestepTextProjEmbeddings(embedding_dim=hidden_size, pooled_projection_dim=in_channels)
1092
+ self.proj_in = nn.Linear(in_channels, hidden_size, bias=True)
1093
+ self.token_refiner = HunyuanVideoIndividualTokenRefiner(
1094
+ num_attention_heads=num_attention_heads,
1095
+ attention_head_dim=attention_head_dim,
1096
+ num_layers=num_layers,
1097
+ mlp_width_ratio=mlp_ratio,
1098
+ mlp_drop_rate=mlp_drop_rate,
1099
+ attention_bias=attention_bias,
1100
+ )
1101
+
1102
+ def forward(
1103
+ self,
1104
+ hidden_states: torch.Tensor,
1105
+ timestep: torch.LongTensor,
1106
+ attention_mask: Optional[torch.LongTensor] = None,
1107
+ ) -> torch.Tensor:
1108
+ if attention_mask is None:
1109
+ pooled_projections = hidden_states.mean(dim=1)
1110
+ else:
1111
+ original_dtype = hidden_states.dtype
1112
+ mask_float = attention_mask.float().unsqueeze(-1)
1113
+ pooled_projections = (hidden_states * mask_float).sum(dim=1) / mask_float.sum(dim=1)
1114
+ pooled_projections = pooled_projections.to(original_dtype)
1115
+
1116
+ temb = self.time_text_embed(timestep, pooled_projections)
1117
+ del pooled_projections # free memory
1118
+
1119
+ hidden_states = self.proj_in(hidden_states)
1120
+ hidden_states = self.token_refiner(hidden_states, temb, attention_mask)
1121
+ del temb, attention_mask # free memory
1122
+
1123
+ return hidden_states
1124
+
1125
+
1126
+ class HunyuanVideoRotaryPosEmbed(nn.Module):
1127
+ def __init__(self, rope_dim, theta):
1128
+ super().__init__()
1129
+ self.DT, self.DY, self.DX = rope_dim
1130
+ self.theta = theta
1131
+ self.h_w_scaling_factor = 1.0
1132
+
1133
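+ # Builds per-axis rotary frequency tables: the outer product of inverse-theta frequencies with the
+ # position grid, with each frequency repeated twice for the paired rotary channels; returns cos and sin.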
+ @torch.no_grad()
1134
+ def get_frequency(self, dim, pos):
1135
+ T, H, W = pos.shape
1136
+ freqs = 1.0 / (self.theta ** (torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device)[: (dim // 2)] / dim))
1137
+ freqs = torch.outer(freqs, pos.reshape(-1)).unflatten(-1, (T, H, W)).repeat_interleave(2, dim=0)
1138
+ return freqs.cos(), freqs.sin()
1139
+
1140
+ @torch.no_grad()
1141
+ def forward_inner(self, frame_indices, height, width, device):
1142
+ GT, GY, GX = torch.meshgrid(
1143
+ frame_indices.to(device=device, dtype=torch.float32),
1144
+ torch.arange(0, height, device=device, dtype=torch.float32) * self.h_w_scaling_factor,
1145
+ torch.arange(0, width, device=device, dtype=torch.float32) * self.h_w_scaling_factor,
1146
+ indexing="ij",
1147
+ )
1148
+
1149
+ FCT, FST = self.get_frequency(self.DT, GT)
1150
+ del GT # free memory
1151
+ FCY, FSY = self.get_frequency(self.DY, GY)
1152
+ del GY # free memory
1153
+ FCX, FSX = self.get_frequency(self.DX, GX)
1154
+ del GX # free memory
1155
+
1156
+ result = torch.cat([FCT, FCY, FCX, FST, FSY, FSX], dim=0)
1157
+ del FCT, FCY, FCX, FST, FSY, FSX # free memory
1158
+
1159
+ # Return result already on the correct device
1160
+ return result # Shape (2 * total_dim / 2, T, H, W) -> (total_dim, T, H, W)
1161
+
1162
+ @torch.no_grad()
1163
+ def forward(self, frame_indices, height, width, device):
1164
+ frame_indices = frame_indices.unbind(0)
1165
+ results = [self.forward_inner(f, height, width, device) for f in frame_indices]
1166
+ results = torch.stack(results, dim=0)
1167
+ return results
1168
+
1169
+
1170
+ class AdaLayerNormZero(nn.Module):
1171
+ def __init__(self, embedding_dim: int, norm_type="layer_norm", bias=True):
1172
+ super().__init__()
1173
+ self.silu = nn.SiLU()
1174
+ self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=bias)
1175
+ if norm_type == "layer_norm":
1176
+ self.norm = LayerNormFramePack(embedding_dim, elementwise_affine=False, eps=1e-6)
1177
+ else:
1178
+ raise ValueError(f"unknown norm_type {norm_type}")
1179
+
1180
+ def forward(
1181
+ self, x: torch.Tensor, emb: Optional[torch.Tensor] = None
1182
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
1183
+ emb = emb.unsqueeze(-2)
1184
+ emb = self.linear(self.silu(emb))
1185
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=-1)
1186
+ x = self.norm(x) * (1 + scale_msa) + shift_msa
1187
+ return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
1188
+
1189
+
1190
+ class AdaLayerNormZeroSingle(nn.Module):
1191
+ def __init__(self, embedding_dim: int, norm_type="layer_norm", bias=True):
1192
+ super().__init__()
1193
+
1194
+ self.silu = nn.SiLU()
1195
+ self.linear = nn.Linear(embedding_dim, 3 * embedding_dim, bias=bias)
1196
+ if norm_type == "layer_norm":
1197
+ self.norm = LayerNormFramePack(embedding_dim, elementwise_affine=False, eps=1e-6)
1198
+ else:
1199
+ raise ValueError(f"unknown norm_type {norm_type}")
1200
+
1201
+ def forward(
1202
+ self,
1203
+ x: torch.Tensor,
1204
+ emb: Optional[torch.Tensor] = None,
1205
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1206
+ emb = emb.unsqueeze(-2)
1207
+ emb = self.linear(self.silu(emb))
1208
+ shift_msa, scale_msa, gate_msa = emb.chunk(3, dim=-1)
1209
+ x = self.norm(x) * (1 + scale_msa) + shift_msa
1210
+ return x, gate_msa
1211
+
1212
+
1213
+ class AdaLayerNormContinuous(nn.Module):
1214
+ def __init__(
1215
+ self,
1216
+ embedding_dim: int,
1217
+ conditioning_embedding_dim: int,
1218
+ elementwise_affine=True,
1219
+ eps=1e-5,
1220
+ bias=True,
1221
+ norm_type="layer_norm",
1222
+ ):
1223
+ super().__init__()
1224
+ self.silu = nn.SiLU()
1225
+ self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
1226
+ if norm_type == "layer_norm":
1227
+ self.norm = LayerNormFramePack(embedding_dim, eps, elementwise_affine, bias)
1228
+ else:
1229
+ raise ValueError(f"unknown norm_type {norm_type}")
1230
+
1231
+ def forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor:
1232
+ emb = emb.unsqueeze(-2)
1233
+ emb = self.linear(self.silu(emb))
1234
+ scale, shift = emb.chunk(2, dim=-1)
1235
+ del emb # free memory
1236
+ x = self.norm(x) * (1 + scale) + shift
1237
+ return x
1238
+
1239
+
1240
+ class HunyuanVideoSingleTransformerBlock(nn.Module):
1241
+ def __init__(
1242
+ self,
1243
+ num_attention_heads: int,
1244
+ attention_head_dim: int,
1245
+ mlp_ratio: float = 4.0,
1246
+ qk_norm: str = "rms_norm",
1247
+ attn_mode: Optional[str] = None,
1248
+ split_attn: Optional[bool] = False,
1249
+ ) -> None:
1250
+ super().__init__()
1251
+
1252
+ hidden_size = num_attention_heads * attention_head_dim
1253
+ mlp_dim = int(hidden_size * mlp_ratio)
1254
+ self.attn_mode = attn_mode
1255
+ self.split_attn = split_attn
1256
+
1257
+ # Attention layer (pre_only=True means no output projection in Attention module itself)
1258
+ self.attn = Attention(
1259
+ query_dim=hidden_size,
1260
+ cross_attention_dim=None,
1261
+ dim_head=attention_head_dim,
1262
+ heads=num_attention_heads,
1263
+ out_dim=hidden_size,
1264
+ bias=True,
1265
+ processor=HunyuanAttnProcessorFlashAttnSingle(),
1266
+ qk_norm=qk_norm,
1267
+ eps=1e-6,
1268
+ pre_only=True, # Crucial: Attn processor will return raw attention output
1269
+ )
1270
+
1271
+ self.norm = AdaLayerNormZeroSingle(hidden_size, norm_type="layer_norm")
1272
+ self.proj_mlp = nn.Linear(hidden_size, mlp_dim)
1273
+ self.act_mlp = nn.GELU(approximate="tanh")
1274
+ self.proj_out = nn.Linear(hidden_size + mlp_dim, hidden_size)
1275
+
1276
+ def forward(
1277
+ self,
1278
+ hidden_states: torch.Tensor,
1279
+ encoder_hidden_states: torch.Tensor,
1280
+ temb: torch.Tensor,
1281
+ attention_mask: Optional[torch.Tensor] = None,
1282
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
1283
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1284
+ text_seq_length = encoder_hidden_states.shape[1]
1285
+ hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)
1286
+ del encoder_hidden_states # free memory
1287
+
1288
+ residual = hidden_states
1289
+
1290
+ # 1. Input normalization
1291
+ norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
1292
+ mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
1293
+
1294
+ norm_hidden_states, norm_encoder_hidden_states = (
1295
+ norm_hidden_states[:, :-text_seq_length, :],
1296
+ norm_hidden_states[:, -text_seq_length:, :],
1297
+ )
1298
+
1299
+ # 2. Attention
1300
+ attn_output, context_attn_output = self.attn(
1301
+ hidden_states=norm_hidden_states,
1302
+ encoder_hidden_states=norm_encoder_hidden_states,
1303
+ attention_mask=attention_mask,
1304
+ image_rotary_emb=image_rotary_emb,
1305
+ attn_mode=self.attn_mode,
1306
+ split_attn=self.split_attn,
1307
+ )
1308
+ attn_output = torch.cat([attn_output, context_attn_output], dim=1)
1309
+ del norm_hidden_states, norm_encoder_hidden_states, context_attn_output # free memory
1310
+ del image_rotary_emb
1311
+
1312
+ # 3. Modulation and residual connection
1313
+ hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
1314
+ del attn_output, mlp_hidden_states # free memory
1315
+ hidden_states = gate * self.proj_out(hidden_states)
1316
+ hidden_states = hidden_states + residual
1317
+
1318
+ hidden_states, encoder_hidden_states = (
1319
+ hidden_states[:, :-text_seq_length, :],
1320
+ hidden_states[:, -text_seq_length:, :],
1321
+ )
1322
+ return hidden_states, encoder_hidden_states
1323
+
1324
+
1325
+ class HunyuanVideoTransformerBlock(nn.Module):
1326
+ def __init__(
1327
+ self,
1328
+ num_attention_heads: int,
1329
+ attention_head_dim: int,
1330
+ mlp_ratio: float,
1331
+ qk_norm: str = "rms_norm",
1332
+ attn_mode: Optional[str] = None,
1333
+ split_attn: Optional[bool] = False,
1334
+ ) -> None:
1335
+ super().__init__()
1336
+
1337
+ hidden_size = num_attention_heads * attention_head_dim
1338
+ self.attn_mode = attn_mode
1339
+ self.split_attn = split_attn
1340
+
1341
+ self.norm1 = AdaLayerNormZero(hidden_size, norm_type="layer_norm")
1342
+ self.norm1_context = AdaLayerNormZero(hidden_size, norm_type="layer_norm")
1343
+
1344
+ self.attn = Attention(
1345
+ query_dim=hidden_size,
1346
+ cross_attention_dim=None,
1347
+ added_kv_proj_dim=hidden_size,
1348
+ dim_head=attention_head_dim,
1349
+ heads=num_attention_heads,
1350
+ out_dim=hidden_size,
1351
+ context_pre_only=False,
1352
+ bias=True,
1353
+ processor=HunyuanAttnProcessorFlashAttnDouble(),
1354
+ qk_norm=qk_norm,
1355
+ eps=1e-6,
1356
+ )
1357
+
1358
+ self.norm2 = LayerNormFramePack(hidden_size, elementwise_affine=False, eps=1e-6)
1359
+ self.ff = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
1360
+
1361
+ self.norm2_context = LayerNormFramePack(hidden_size, elementwise_affine=False, eps=1e-6)
1362
+ self.ff_context = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
1363
+
1364
+ def forward(
1365
+ self,
1366
+ hidden_states: torch.Tensor,
1367
+ encoder_hidden_states: torch.Tensor,
1368
+ temb: torch.Tensor,
1369
+ attention_mask: Optional[torch.Tensor] = None,
1370
+ freqs_cis: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
1371
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1372
+ # 1. Input normalization
1373
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
1374
+ norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
1375
+ encoder_hidden_states, emb=temb
1376
+ )
1377
+
1378
+ # 2. Joint attention
1379
+ attn_output, context_attn_output = self.attn(
1380
+ hidden_states=norm_hidden_states,
1381
+ encoder_hidden_states=norm_encoder_hidden_states,
1382
+ attention_mask=attention_mask,
1383
+ image_rotary_emb=freqs_cis,
1384
+ attn_mode=self.attn_mode,
1385
+ split_attn=self.split_attn,
1386
+ )
1387
+ del norm_hidden_states, norm_encoder_hidden_states, freqs_cis # free memory
1388
+
1389
+ # 3. Modulation and residual connection
1390
+ hidden_states = hidden_states + attn_output * gate_msa
1391
+ del attn_output, gate_msa # free memory
1392
+ encoder_hidden_states = encoder_hidden_states + context_attn_output * c_gate_msa
1393
+ del context_attn_output, c_gate_msa # free memory
1394
+
1395
+ norm_hidden_states = self.norm2(hidden_states)
1396
+ norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
1397
+
1398
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
1399
+ del shift_mlp, scale_mlp # free memory
1400
+ norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp) + c_shift_mlp
1401
+ del c_shift_mlp, c_scale_mlp # free memory
1402
+
1403
+ # 4. Feed-forward
1404
+ ff_output = self.ff(norm_hidden_states)
1405
+ del norm_hidden_states # free memory
1406
+ context_ff_output = self.ff_context(norm_encoder_hidden_states)
1407
+ del norm_encoder_hidden_states # free memory
1408
+
1409
+ hidden_states = hidden_states + gate_mlp * ff_output
1410
+ del ff_output, gate_mlp # free memory
1411
+ encoder_hidden_states = encoder_hidden_states + c_gate_mlp * context_ff_output
1412
+ del context_ff_output, c_gate_mlp # free memory
1413
+
1414
+ return hidden_states, encoder_hidden_states
1415
+
1416
+
1417
+ class ClipVisionProjection(nn.Module):
1418
+ def __init__(self, in_channels, out_channels):
1419
+ super().__init__()
1420
+ self.up = nn.Linear(in_channels, out_channels * 3)
1421
+ self.down = nn.Linear(out_channels * 3, out_channels)
1422
+
1423
+ def forward(self, x):
1424
+ projected_x = self.down(nn.functional.silu(self.up(x)))
1425
+ return projected_x
1426
+
1427
+
1428
+ class HunyuanVideoPatchEmbed(nn.Module):
1429
+ def __init__(self, patch_size, in_chans, embed_dim):
1430
+ super().__init__()
1431
+ self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
1432
+
1433
+
1434
+ class HunyuanVideoPatchEmbedForCleanLatents(nn.Module):
1435
+ def __init__(self, inner_dim):
1436
+ super().__init__()
1437
+ self.proj = nn.Conv3d(16, inner_dim, kernel_size=(1, 2, 2), stride=(1, 2, 2))
1438
+ self.proj_2x = nn.Conv3d(16, inner_dim, kernel_size=(2, 4, 4), stride=(2, 4, 4))
1439
+ self.proj_4x = nn.Conv3d(16, inner_dim, kernel_size=(4, 8, 8), stride=(4, 8, 8))
1440
+
1441
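+ # Initializes the 2x/4x clean-latent patch embedders by tiling the base 1x conv weights over the larger
+ # kernel and dividing by the number of copies (8 and 64), so they initially behave like the base
+ # projection applied to an averaged patch.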
+ @torch.no_grad()
1442
+ def initialize_weight_from_another_conv3d(self, another_layer):
1443
+ weight = another_layer.weight.detach().clone()
1444
+ bias = another_layer.bias.detach().clone()
1445
+
1446
+ sd = {
1447
+ "proj.weight": weight.clone(),
1448
+ "proj.bias": bias.clone(),
1449
+ "proj_2x.weight": einops.repeat(weight, "b c t h w -> b c (t tk) (h hk) (w wk)", tk=2, hk=2, wk=2) / 8.0,
1450
+ "proj_2x.bias": bias.clone(),
1451
+ "proj_4x.weight": einops.repeat(weight, "b c t h w -> b c (t tk) (h hk) (w wk)", tk=4, hk=4, wk=4) / 64.0,
1452
+ "proj_4x.bias": bias.clone(),
1453
+ }
1454
+
1455
+ sd = {k: v.clone() for k, v in sd.items()}
1456
+
1457
+ self.load_state_dict(sd)
1458
+ return
1459
+
1460
+
1461
+ class HunyuanVideoTransformer3DModelPacked(nn.Module): # (PreTrainedModelMixin, GenerationMixin,
1462
+ # ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
1463
+ # @register_to_config
1464
+ def __init__(
1465
+ self,
1466
+ in_channels: int = 16,
1467
+ out_channels: int = 16,
1468
+ num_attention_heads: int = 24,
1469
+ attention_head_dim: int = 128,
1470
+ num_layers: int = 20,
1471
+ num_single_layers: int = 40,
1472
+ num_refiner_layers: int = 2,
1473
+ mlp_ratio: float = 4.0,
1474
+ patch_size: int = 2,
1475
+ patch_size_t: int = 1,
1476
+ qk_norm: str = "rms_norm",
1477
+ guidance_embeds: bool = True,
1478
+ text_embed_dim: int = 4096,
1479
+ pooled_projection_dim: int = 768,
1480
+ rope_theta: float = 256.0,
1481
+ rope_axes_dim: Tuple[int] = (16, 56, 56),
1482
+ has_image_proj=False,
1483
+ image_proj_dim=1152,
1484
+ has_clean_x_embedder=False,
1485
+ attn_mode: Optional[str] = None,
1486
+ split_attn: Optional[bool] = False,
1487
+ ) -> None:
1488
+ super().__init__()
1489
+
1490
+ inner_dim = num_attention_heads * attention_head_dim
1491
+ out_channels = out_channels or in_channels
1492
+ self.config_patch_size = patch_size
1493
+ self.config_patch_size_t = patch_size_t
1494
+
1495
+ # 1. Latent and condition embedders
1496
+ self.x_embedder = HunyuanVideoPatchEmbed((patch_size_t, patch_size, patch_size), in_channels, inner_dim)
1497
+ self.context_embedder = HunyuanVideoTokenRefiner(
1498
+ text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers
1499
+ )
1500
+ self.time_text_embed = CombinedTimestepGuidanceTextProjEmbeddings(inner_dim, pooled_projection_dim)
1501
+
1502
+ self.clean_x_embedder = None
1503
+ self.image_projection = None
1504
+
1505
+ # 2. RoPE
1506
+ self.rope = HunyuanVideoRotaryPosEmbed(rope_axes_dim, rope_theta)
1507
+
1508
+ # 3. Dual stream transformer blocks
1509
+ self.transformer_blocks = nn.ModuleList(
1510
+ [
1511
+ HunyuanVideoTransformerBlock(
1512
+ num_attention_heads,
1513
+ attention_head_dim,
1514
+ mlp_ratio=mlp_ratio,
1515
+ qk_norm=qk_norm,
1516
+ attn_mode=attn_mode,
1517
+ split_attn=split_attn,
1518
+ )
1519
+ for _ in range(num_layers)
1520
+ ]
1521
+ )
1522
+
1523
+ # 4. Single stream transformer blocks
1524
+ self.single_transformer_blocks = nn.ModuleList(
1525
+ [
1526
+ HunyuanVideoSingleTransformerBlock(
1527
+ num_attention_heads,
1528
+ attention_head_dim,
1529
+ mlp_ratio=mlp_ratio,
1530
+ qk_norm=qk_norm,
1531
+ attn_mode=attn_mode,
1532
+ split_attn=split_attn,
1533
+ )
1534
+ for _ in range(num_single_layers)
1535
+ ]
1536
+ )
1537
+
1538
+ # 5. Output projection
1539
+ self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6)
1540
+ self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels)
1541
+
1542
+ self.inner_dim = inner_dim
1543
+ self.use_gradient_checkpointing = False
1544
+ self.enable_teacache = False
1545
+
1546
+ # if has_image_proj:
1547
+ # self.install_image_projection(image_proj_dim)
1548
+ self.image_projection = ClipVisionProjection(in_channels=image_proj_dim, out_channels=self.inner_dim)
1549
+ # self.config["has_image_proj"] = True
1550
+ # self.config["image_proj_dim"] = in_channels
1551
+
1552
+ # if has_clean_x_embedder:
1553
+ # self.install_clean_x_embedder()
1554
+ self.clean_x_embedder = HunyuanVideoPatchEmbedForCleanLatents(self.inner_dim)
1555
+ # self.config["has_clean_x_embedder"] = True
1556
+
1557
+ self.high_quality_fp32_output_for_inference = True  # default changed from False to True
1558
+
1559
+ # Block swapping attributes (initialized to None)
1560
+ self.blocks_to_swap = None
1561
+ self.offloader_double = None
1562
+ self.offloader_single = None
1563
+
1564
+ # RoPE scaling
1565
+ self.rope_scaling_timestep_threshold: Optional[int] = None # scale RoPE above this timestep
1566
+ self.rope_scaling_factor: float = 1.0 # RoPE scaling factor
1567
+
1568
+ @property
1569
+ def device(self):
1570
+ return next(self.parameters()).device
1571
+
1572
+ @property
1573
+ def dtype(self):
1574
+ return next(self.parameters()).dtype
1575
+
1576
+ def enable_gradient_checkpointing(self):
1577
+ self.use_gradient_checkpointing = True
1578
+ print("Gradient checkpointing enabled for HunyuanVideoTransformer3DModelPacked.") # Logging
1579
+
1580
+ def disable_gradient_checkpointing(self):
1581
+ self.use_gradient_checkpointing = False
1582
+ print("Gradient checkpointing disabled for HunyuanVideoTransformer3DModelPacked.") # Logging
1583
+
1584
+ def initialize_teacache(self, enable_teacache=True, num_steps=25, rel_l1_thresh=0.15):
1585
+ self.enable_teacache = enable_teacache
1586
+ self.cnt = 0
1587
+ self.num_steps = num_steps
1588
+ self.rel_l1_thresh = rel_l1_thresh # 0.1 for 1.6x speedup, 0.15 for 2.1x speedup
1589
+ self.accumulated_rel_l1_distance = 0
1590
+ self.previous_modulated_input = None
1591
+ self.previous_residual = None
1592
+ self.teacache_rescale_func = np.poly1d([7.33226126e02, -4.01131952e02, 6.75869174e01, -3.14987800e00, 9.61237896e-02])
1593
+ if enable_teacache:
1594
+ print(f"TeaCache enabled: num_steps={num_steps}, rel_l1_thresh={rel_l1_thresh}")
1595
+ else:
1596
+ print("TeaCache disabled.")
1597
+
1598
+ def gradient_checkpointing_method(self, block, *args):
1599
+ if self.use_gradient_checkpointing:
1600
+ result = torch.utils.checkpoint.checkpoint(block, *args, use_reentrant=False)
1601
+ else:
1602
+ result = block(*args)
1603
+ return result
1604
+
1605
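+ # Splits the requested number of swapped blocks between the double-stream and single-stream stacks;
+ # presumably because single blocks are about half the size, roughly twice as many of them are swapped.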
+ def enable_block_swap(self, num_blocks: int, device: torch.device, supports_backward: bool):
1606
+ self.blocks_to_swap = num_blocks
1607
+ self.num_double_blocks = len(self.transformer_blocks)
1608
+ self.num_single_blocks = len(self.single_transformer_blocks)
1609
+ double_blocks_to_swap = num_blocks // 2
1610
+ single_blocks_to_swap = (num_blocks - double_blocks_to_swap) * 2 + 1
1611
+
1612
+ assert double_blocks_to_swap <= self.num_double_blocks - 1 and single_blocks_to_swap <= self.num_single_blocks - 1, (
1613
+ f"Cannot swap more than {self.num_double_blocks - 1} double blocks and {self.num_single_blocks - 1} single blocks. "
1614
+ f"Requested {double_blocks_to_swap} double blocks and {single_blocks_to_swap} single blocks."
1615
+ )
1616
+
1617
+ self.offloader_double = ModelOffloader(
1618
+ "double",
1619
+ self.transformer_blocks,
1620
+ self.num_double_blocks,
1621
+ double_blocks_to_swap,
1622
+ supports_backward,
1623
+ device,
1624
+ # debug=True # Optional debugging
1625
+ )
1626
+ self.offloader_single = ModelOffloader(
1627
+ "single",
1628
+ self.single_transformer_blocks,
1629
+ self.num_single_blocks,
1630
+ single_blocks_to_swap,
1631
+ supports_backward,
1632
+ device, # , debug=True
1633
+ )
1634
+ print(
1635
+ f"HunyuanVideoTransformer3DModelPacked: Block swap enabled. Swapping {num_blocks} blocks, "
1636
+ + f"double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}, supports_backward: {supports_backward}."
1637
+ )
1638
+
1639
+ def switch_block_swap_for_inference(self):
1640
+ if self.blocks_to_swap and self.blocks_to_swap > 0:
1641
+ self.offloader_double.set_forward_only(True)
1642
+ self.offloader_single.set_forward_only(True)
1643
+ self.prepare_block_swap_before_forward()
1644
+ print(f"HunyuanVideoTransformer3DModelPacked: Block swap set to forward only.")
1645
+
1646
+ def switch_block_swap_for_training(self):
1647
+ if self.blocks_to_swap and self.blocks_to_swap > 0:
1648
+ self.offloader_double.set_forward_only(False)
1649
+ self.offloader_single.set_forward_only(False)
1650
+ self.prepare_block_swap_before_forward()
1651
+ print(f"HunyuanVideoTransformer3DModelPacked: Block swap set to forward and backward.")
1652
+
1653
+ def move_to_device_except_swap_blocks(self, device: torch.device):
1654
+ # assume model is on cpu. do not move blocks to device to reduce temporary memory usage
1655
+ if self.blocks_to_swap:
1656
+ saved_double_blocks = self.transformer_blocks
1657
+ saved_single_blocks = self.single_transformer_blocks
1658
+ self.transformer_blocks = None
1659
+ self.single_transformer_blocks = None
1660
+
1661
+ self.to(device)
1662
+
1663
+ if self.blocks_to_swap:
1664
+ self.transformer_blocks = saved_double_blocks
1665
+ self.single_transformer_blocks = saved_single_blocks
1666
+
1667
+ def prepare_block_swap_before_forward(self):
1668
+ if self.blocks_to_swap is None or self.blocks_to_swap == 0:
1669
+ return
1670
+ self.offloader_double.prepare_block_devices_before_forward(self.transformer_blocks)
1671
+ self.offloader_single.prepare_block_devices_before_forward(self.single_transformer_blocks)
1672
+
1673
+ def enable_rope_scaling(self, timestep_threshold: Optional[int], rope_scaling_factor: float = 1.0):
1674
+ if timestep_threshold is not None and rope_scaling_factor > 0:
1675
+ self.rope_scaling_timestep_threshold = timestep_threshold
1676
+ self.rope_scaling_factor = rope_scaling_factor
1677
+ logger.info(f"RoPE scaling enabled: threshold={timestep_threshold}, scaling_factor={rope_scaling_factor}.")
1678
+ else:
1679
+ self.rope_scaling_timestep_threshold = None
1680
+ self.rope_scaling_factor = 1.0
1681
+ self.rope.h_w_scaling_factor = 1.0 # reset to default
1682
+ logger.info("RoPE scaling disabled.")
1683
+
1684
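+ # Patch-embeds the noisy latents and, when provided, the clean-latent history at full, 2x and 4x
+ # downsampled resolutions, prepending the context tokens (and their RoPE frequencies) to the sequence.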
+ def process_input_hidden_states(
1685
+ self,
1686
+ latents,
1687
+ latent_indices=None,
1688
+ clean_latents=None,
1689
+ clean_latent_indices=None,
1690
+ clean_latents_2x=None,
1691
+ clean_latent_2x_indices=None,
1692
+ clean_latents_4x=None,
1693
+ clean_latent_4x_indices=None,
1694
+ ):
1695
+ hidden_states = self.gradient_checkpointing_method(self.x_embedder.proj, latents)
1696
+ B, C, T, H, W = hidden_states.shape
1697
+
1698
+ if latent_indices is None:
1699
+ latent_indices = torch.arange(0, T).unsqueeze(0).expand(B, -1)
1700
+
1701
+ hidden_states = hidden_states.flatten(2).transpose(1, 2)
1702
+
1703
+ rope_freqs = self.rope(frame_indices=latent_indices, height=H, width=W, device=hidden_states.device)
1704
+ rope_freqs = rope_freqs.flatten(2).transpose(1, 2)
1705
+
1706
+ if clean_latents is not None and clean_latent_indices is not None:
1707
+ clean_latents = clean_latents.to(hidden_states)
1708
+ clean_latents = self.gradient_checkpointing_method(self.clean_x_embedder.proj, clean_latents)
1709
+ clean_latents = clean_latents.flatten(2).transpose(1, 2)
1710
+
1711
+ clean_latent_rope_freqs = self.rope(frame_indices=clean_latent_indices, height=H, width=W, device=clean_latents.device)
1712
+ clean_latent_rope_freqs = clean_latent_rope_freqs.flatten(2).transpose(1, 2)
1713
+
1714
+ hidden_states = torch.cat([clean_latents, hidden_states], dim=1)
1715
+ rope_freqs = torch.cat([clean_latent_rope_freqs, rope_freqs], dim=1)
1716
+
1717
+ if clean_latents_2x is not None and clean_latent_2x_indices is not None:
1718
+ clean_latents_2x = clean_latents_2x.to(hidden_states)
1719
+ clean_latents_2x = pad_for_3d_conv(clean_latents_2x, (2, 4, 4))
1720
+ clean_latents_2x = self.gradient_checkpointing_method(self.clean_x_embedder.proj_2x, clean_latents_2x)
1721
+ clean_latents_2x = clean_latents_2x.flatten(2).transpose(1, 2)
1722
+
1723
+ clean_latent_2x_rope_freqs = self.rope(
1724
+ frame_indices=clean_latent_2x_indices, height=H, width=W, device=clean_latents_2x.device
1725
+ )
1726
+ clean_latent_2x_rope_freqs = pad_for_3d_conv(clean_latent_2x_rope_freqs, (2, 2, 2))
1727
+ clean_latent_2x_rope_freqs = center_down_sample_3d(clean_latent_2x_rope_freqs, (2, 2, 2))
1728
+ clean_latent_2x_rope_freqs = clean_latent_2x_rope_freqs.flatten(2).transpose(1, 2)
1729
+
1730
+ hidden_states = torch.cat([clean_latents_2x, hidden_states], dim=1)
1731
+ rope_freqs = torch.cat([clean_latent_2x_rope_freqs, rope_freqs], dim=1)
1732
+
1733
+ if clean_latents_4x is not None and clean_latent_4x_indices is not None:
1734
+ clean_latents_4x = clean_latents_4x.to(hidden_states)
1735
+ clean_latents_4x = pad_for_3d_conv(clean_latents_4x, (4, 8, 8))
1736
+ clean_latents_4x = self.gradient_checkpointing_method(self.clean_x_embedder.proj_4x, clean_latents_4x)
1737
+ clean_latents_4x = clean_latents_4x.flatten(2).transpose(1, 2)
1738
+
1739
+ clean_latent_4x_rope_freqs = self.rope(
1740
+ frame_indices=clean_latent_4x_indices, height=H, width=W, device=clean_latents_4x.device
1741
+ )
1742
+ clean_latent_4x_rope_freqs = pad_for_3d_conv(clean_latent_4x_rope_freqs, (4, 4, 4))
1743
+ clean_latent_4x_rope_freqs = center_down_sample_3d(clean_latent_4x_rope_freqs, (4, 4, 4))
1744
+ clean_latent_4x_rope_freqs = clean_latent_4x_rope_freqs.flatten(2).transpose(1, 2)
1745
+
1746
+ hidden_states = torch.cat([clean_latents_4x, hidden_states], dim=1)
1747
+ rope_freqs = torch.cat([clean_latent_4x_rope_freqs, rope_freqs], dim=1)
1748
+
1749
+ return hidden_states, rope_freqs
1750
+
1751
+ def forward(
1752
+ self,
1753
+ hidden_states,
1754
+ timestep,
1755
+ encoder_hidden_states,
1756
+ encoder_attention_mask,
1757
+ pooled_projections,
1758
+ guidance,
1759
+ latent_indices=None,
1760
+ clean_latents=None,
1761
+ clean_latent_indices=None,
1762
+ clean_latents_2x=None,
1763
+ clean_latent_2x_indices=None,
1764
+ clean_latents_4x=None,
1765
+ clean_latent_4x_indices=None,
1766
+ image_embeddings=None,
1767
+ attention_kwargs=None,
1768
+ return_dict=True,
1769
+ ):
1770
+
1771
+ if attention_kwargs is None:
1772
+ attention_kwargs = {}
1773
+
1774
+ # RoPE scaling: must be done before processing hidden states
1775
+ if self.rope_scaling_timestep_threshold is not None:
1776
+ if timestep >= self.rope_scaling_timestep_threshold:
1777
+ self.rope.h_w_scaling_factor = self.rope_scaling_factor
1778
+ else:
1779
+ self.rope.h_w_scaling_factor = 1.0
1780
+
1781
+ batch_size, num_channels, num_frames, height, width = hidden_states.shape
1782
+ p, p_t = self.config_patch_size, self.config_patch_size_t
1783
+ post_patch_num_frames = num_frames // p_t
1784
+ post_patch_height = height // p
1785
+ post_patch_width = width // p
1786
+ original_context_length = post_patch_num_frames * post_patch_height * post_patch_width
1787
+
1788
+ hidden_states, rope_freqs = self.process_input_hidden_states(
1789
+ hidden_states,
1790
+ latent_indices,
1791
+ clean_latents,
1792
+ clean_latent_indices,
1793
+ clean_latents_2x,
1794
+ clean_latent_2x_indices,
1795
+ clean_latents_4x,
1796
+ clean_latent_4x_indices,
1797
+ )
1798
+ del (
1799
+ latent_indices,
1800
+ clean_latents,
1801
+ clean_latent_indices,
1802
+ clean_latents_2x,
1803
+ clean_latent_2x_indices,
1804
+ clean_latents_4x,
1805
+ clean_latent_4x_indices,
1806
+ ) # free memory
1807
+
1808
+ temb = self.gradient_checkpointing_method(self.time_text_embed, timestep, guidance, pooled_projections)
1809
+ encoder_hidden_states = self.gradient_checkpointing_method(
1810
+ self.context_embedder, encoder_hidden_states, timestep, encoder_attention_mask
1811
+ )
1812
+
1813
+ if self.image_projection is not None:
1814
+ assert image_embeddings is not None, "You must use image embeddings!"
1815
+ extra_encoder_hidden_states = self.gradient_checkpointing_method(self.image_projection, image_embeddings)
1816
+ extra_attention_mask = torch.ones(
1817
+ (batch_size, extra_encoder_hidden_states.shape[1]),
1818
+ dtype=encoder_attention_mask.dtype,
1819
+ device=encoder_attention_mask.device,
1820
+ )
1821
+
1822
+ # must cat before (not after) encoder_hidden_states, due to attn masking
1823
+ encoder_hidden_states = torch.cat([extra_encoder_hidden_states, encoder_hidden_states], dim=1)
1824
+ encoder_attention_mask = torch.cat([extra_attention_mask, encoder_attention_mask], dim=1)
1825
+ del extra_encoder_hidden_states, extra_attention_mask # free memory
1826
+
1827
+ with torch.no_grad():
1828
+ if batch_size == 1:
1829
+ # When batch size is 1, we do not need any masks or var-len functions, since cropping to the valid text length is mathematically equivalent to what we want.
1830
+ # If a masked implementation gives a different result, that implementation is wrong; this one is the correct reference.
1831
+ text_len = encoder_attention_mask.sum().item()
1832
+ encoder_hidden_states = encoder_hidden_states[:, :text_len]
1833
+ attention_mask = None, None, None, None
1834
+ else:
1835
+ img_seq_len = hidden_states.shape[1]
1836
+ txt_seq_len = encoder_hidden_states.shape[1]
1837
+
1838
+ cu_seqlens_q = get_cu_seqlens(encoder_attention_mask, img_seq_len)
1839
+ cu_seqlens_kv = cu_seqlens_q
1840
+ max_seqlen_q = img_seq_len + txt_seq_len
1841
+ max_seqlen_kv = max_seqlen_q
1842
+
1843
+ attention_mask = cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv
1844
+ del cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv # free memory
1845
+ del encoder_attention_mask # free memory
1846
+
1847
+ if self.enable_teacache:
1848
+ modulated_inp = self.transformer_blocks[0].norm1(hidden_states, emb=temb)[0]
1849
+
1850
+ if self.cnt == 0 or self.cnt == self.num_steps - 1:
1851
+ should_calc = True
1852
+ self.accumulated_rel_l1_distance = 0
1853
+ else:
1854
+ curr_rel_l1 = (
1855
+ ((modulated_inp - self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean())
1856
+ .cpu()
1857
+ .item()
1858
+ )
1859
+ self.accumulated_rel_l1_distance += self.teacache_rescale_func(curr_rel_l1)
1860
+ should_calc = self.accumulated_rel_l1_distance >= self.rel_l1_thresh
1861
+
1862
+ if should_calc:
1863
+ self.accumulated_rel_l1_distance = 0
1864
+
1865
+ self.previous_modulated_input = modulated_inp
1866
+ self.cnt += 1
1867
+
1868
+ if self.cnt == self.num_steps:
1869
+ self.cnt = 0
1870
+
1871
+ if not should_calc:
1872
+ hidden_states = hidden_states + self.previous_residual
1873
+ else:
1874
+ ori_hidden_states = hidden_states.clone()
1875
+
1876
+ for block_id, block in enumerate(self.transformer_blocks):
1877
+ hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
1878
+ block, hidden_states, encoder_hidden_states, temb, attention_mask, rope_freqs
1879
+ )
1880
+
1881
+ for block_id, block in enumerate(self.single_transformer_blocks):
1882
+ hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
1883
+ block, hidden_states, encoder_hidden_states, temb, attention_mask, rope_freqs
1884
+ )
1885
+
1886
+ self.previous_residual = hidden_states - ori_hidden_states
1887
+ del ori_hidden_states # free memory
1888
+ else:
1889
+ for block_id, block in enumerate(self.transformer_blocks):
1890
+ if self.blocks_to_swap:
1891
+ self.offloader_double.wait_for_block(block_id)
1892
+
1893
+ hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
1894
+ block, hidden_states, encoder_hidden_states, temb, attention_mask, rope_freqs
1895
+ )
1896
+
1897
+ if self.blocks_to_swap:
1898
+ self.offloader_double.submit_move_blocks_forward(self.transformer_blocks, block_id)
1899
+
1900
+ for block_id, block in enumerate(self.single_transformer_blocks):
1901
+ if self.blocks_to_swap:
1902
+ self.offloader_single.wait_for_block(block_id)
1903
+
1904
+ hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
1905
+ block, hidden_states, encoder_hidden_states, temb, attention_mask, rope_freqs
1906
+ )
1907
+
1908
+ if self.blocks_to_swap:
1909
+ self.offloader_single.submit_move_blocks_forward(self.single_transformer_blocks, block_id)
1910
+
1911
+ del attention_mask, rope_freqs # free memory
1912
+ del encoder_hidden_states # free memory
1913
+
1914
+ hidden_states = self.gradient_checkpointing_method(self.norm_out, hidden_states, temb)
1915
+
1916
+ hidden_states = hidden_states[:, -original_context_length:, :]
1917
+
1918
+ if self.high_quality_fp32_output_for_inference:
1919
+ hidden_states = hidden_states.to(dtype=torch.float32)
1920
+ if self.proj_out.weight.dtype != torch.float32:
1921
+ self.proj_out.to(dtype=torch.float32)
1922
+
1923
+ hidden_states = self.gradient_checkpointing_method(self.proj_out, hidden_states)
1924
+
1925
+ hidden_states = einops.rearrange(
1926
+ hidden_states,
1927
+ "b (t h w) (c pt ph pw) -> b c (t pt) (h ph) (w pw)",
1928
+ t=post_patch_num_frames,
1929
+ h=post_patch_height,
1930
+ w=post_patch_width,
1931
+ pt=p_t,
1932
+ ph=p,
1933
+ pw=p,
1934
+ )
1935
+
1936
+ if return_dict:
1937
+ # return Transformer2DModelOutput(sample=hidden_states)
1938
+ return SimpleNamespace(sample=hidden_states)
1939
+
1940
+ return (hidden_states,)
1941
+
1942
+ def fp8_optimization(
1943
+ self, state_dict: dict[str, torch.Tensor], device: torch.device, move_to_device: bool, use_scaled_mm: bool = False
1944
+ ) -> dict[str, torch.Tensor]: # Return type hint added
1945
+ """
1946
+ Optimize the model state_dict with fp8.
1947
+
1948
+ Args:
1949
+ state_dict (dict[str, torch.Tensor]):
1950
+ The state_dict of the model.
1951
+ device (torch.device):
1952
+ The device to calculate the weight.
1953
+ move_to_device (bool):
1954
+ Whether to move the weight to the device after optimization.
1955
+ use_scaled_mm (bool):
1956
+ Whether to use scaled matrix multiplication for FP8.
1957
+ """
1958
+ TARGET_KEYS = ["transformer_blocks", "single_transformer_blocks"]
1959
+ EXCLUDE_KEYS = ["norm"] # Exclude norm layers (e.g., LayerNorm, RMSNorm) from FP8
1960
+
1961
+ # inplace optimization
1962
+ state_dict = optimize_state_dict_with_fp8(state_dict, device, TARGET_KEYS, EXCLUDE_KEYS, move_to_device=move_to_device)
1963
+
1964
+ # apply monkey patching
1965
+ apply_fp8_monkey_patch(self, state_dict, use_scaled_mm=use_scaled_mm)
1966
+
1967
+ return state_dict
1968
+
1969
+
1970
+ def load_packed_model(
1971
+ device: Union[str, torch.device],
1972
+ dit_path: str,
1973
+ attn_mode: str,
1974
+ loading_device: Union[str, torch.device],
1975
+ fp8_scaled: bool = False,
1976
+ split_attn: bool = False,
1977
+ ) -> HunyuanVideoTransformer3DModelPacked:
1978
+ # TODO support split_attn
1979
+ device = torch.device(device)
1980
+ loading_device = torch.device(loading_device)
1981
+
1982
+ if os.path.isdir(dit_path):
1983
+ # we don't support from_pretrained for now, so loading safetensors directly
1984
+ safetensor_files = glob.glob(os.path.join(dit_path, "*.safetensors"))
1985
+ if len(safetensor_files) == 0:
1986
+ raise ValueError(f"Cannot find safetensors file in {dit_path}")
1987
+ # sort by name and take the first one
1988
+ safetensor_files.sort()
1989
+ dit_path = safetensor_files[0]
1990
+
1991
+ with init_empty_weights():
1992
+ logger.info(f"Creating HunyuanVideoTransformer3DModelPacked")
1993
+ model = HunyuanVideoTransformer3DModelPacked(
1994
+ attention_head_dim=128,
1995
+ guidance_embeds=True,
1996
+ has_clean_x_embedder=True,
1997
+ has_image_proj=True,
1998
+ image_proj_dim=1152,
1999
+ in_channels=16,
2000
+ mlp_ratio=4.0,
2001
+ num_attention_heads=24,
2002
+ num_layers=20,
2003
+ num_refiner_layers=2,
2004
+ num_single_layers=40,
2005
+ out_channels=16,
2006
+ patch_size=2,
2007
+ patch_size_t=1,
2008
+ pooled_projection_dim=768,
2009
+ qk_norm="rms_norm",
2010
+ rope_axes_dim=(16, 56, 56),
2011
+ rope_theta=256.0,
2012
+ text_embed_dim=4096,
2013
+ attn_mode=attn_mode,
2014
+ split_attn=split_attn,
2015
+ )
2016
+
2017
+ # if fp8_scaled, load model weights to CPU to reduce VRAM usage. Otherwise, load to the specified device (CPU for block swap or CUDA for others)
2018
+ dit_loading_device = torch.device("cpu") if fp8_scaled else loading_device
2019
+ logger.info(f"Loading DiT model from {dit_path}, device={dit_loading_device}")
2020
+
2021
+ # load model weights with the specified dtype or as is
2022
+ sd = load_split_weights(dit_path, device=dit_loading_device, disable_mmap=True)
2023
+
2024
+ if fp8_scaled:
2025
+ # fp8 optimization: calculate on CUDA, move back to CPU if loading_device is CPU (block swap)
2026
+ logger.info(f"Optimizing model weights to fp8. This may take a while.")
2027
+ sd = model.fp8_optimization(sd, device, move_to_device=loading_device.type == "cpu")
2028
+
2029
+ if loading_device.type != "cpu":
2030
+ # make sure all the model weights are on the loading_device
2031
+ logger.info(f"Moving weights to {loading_device}")
2032
+ for key in sd.keys():
2033
+ sd[key] = sd[key].to(loading_device)
2034
+
2035
+ info = model.load_state_dict(sd, strict=True, assign=True)
2036
+ logger.info(f"Loaded DiT model from {dit_path}, info={info}")
2037
+
2038
+ return model
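For reference, a minimal sketch of how load_packed_model above might be called; the checkpoint path, attention mode, and devices below are placeholders rather than values taken from this commit:

    # Hypothetical loading sketch; dit_path and attn_mode values are assumptions.
    import torch
    from frame_pack.hunyuan_video_packed import load_packed_model

    model = load_packed_model(
        device=torch.device("cuda"),                     # device used for fp8 scaling math
        dit_path="/path/to/framepack_dit.safetensors",   # placeholder checkpoint path
        attn_mode="torch",                               # assumed scaled-dot-product attention mode
        loading_device="cpu",                            # keep weights on CPU, e.g. for block swap
        fp8_scaled=False,
    )
    model.eval()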
frame_pack/k_diffusion_hunyuan.py ADDED
@@ -0,0 +1,128 @@
1
+ # original code: https://github.com/lllyasviel/FramePack
2
+ # original license: Apache-2.0
3
+
4
+ import torch
5
+ import math
6
+
7
+ # from diffusers_helper.k_diffusion.uni_pc_fm import sample_unipc
8
+ # from diffusers_helper.k_diffusion.wrapper import fm_wrapper
9
+ # from diffusers_helper.utils import repeat_to_batch_size
10
+ from frame_pack.uni_pc_fm import sample_unipc
11
+ from frame_pack.wrapper import fm_wrapper
12
+ from frame_pack.utils import repeat_to_batch_size
13
+
14
+
15
+ def flux_time_shift(t, mu=1.15, sigma=1.0):
16
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
17
+
18
+
19
+ def calculate_flux_mu(context_length, x1=256, y1=0.5, x2=4096, y2=1.15, exp_max=7.0):
20
+ k = (y2 - y1) / (x2 - x1)
21
+ b = y1 - k * x1
22
+ mu = k * context_length + b
23
+ mu = min(mu, math.log(exp_max))
24
+ return mu
25
+
26
+
27
+ def get_flux_sigmas_from_mu(n, mu):
28
+ sigmas = torch.linspace(1, 0, steps=n + 1)
29
+ sigmas = flux_time_shift(sigmas, mu=mu)
30
+ return sigmas
31
+
32
+
33
+ # @torch.inference_mode()
34
+ def sample_hunyuan(
35
+ transformer,
36
+ sampler="unipc",
37
+ initial_latent=None,
38
+ concat_latent=None,
39
+ strength=1.0,
40
+ width=512,
41
+ height=512,
42
+ frames=16,
43
+ real_guidance_scale=1.0,
44
+ distilled_guidance_scale=6.0,
45
+ guidance_rescale=0.0,
46
+ shift=None,
47
+ num_inference_steps=25,
48
+ batch_size=None,
49
+ generator=None,
50
+ prompt_embeds=None,
51
+ prompt_embeds_mask=None,
52
+ prompt_poolers=None,
53
+ negative_prompt_embeds=None,
54
+ negative_prompt_embeds_mask=None,
55
+ negative_prompt_poolers=None,
56
+ dtype=torch.bfloat16,
57
+ device=None,
58
+ negative_kwargs=None,
59
+ callback=None,
60
+ **kwargs,
61
+ ):
62
+ device = device or transformer.device
63
+
64
+ if batch_size is None:
65
+ batch_size = int(prompt_embeds.shape[0])
66
+
67
+ latents = torch.randn(
68
+ (batch_size, 16, (frames + 3) // 4, height // 8, width // 8), generator=generator, device=generator.device
69
+ ).to(device=device, dtype=torch.float32)
70
+
71
+ B, C, T, H, W = latents.shape
72
+ seq_length = T * H * W // 4 # 9*80*80//4 = 14400
73
+
74
+ if shift is None:
75
+ mu = calculate_flux_mu(seq_length, exp_max=7.0) # 1.9459... if seq_len is large, mu is clipped.
76
+ else:
77
+ mu = math.log(shift)
78
+
79
+ sigmas = get_flux_sigmas_from_mu(num_inference_steps, mu).to(device)
80
+
81
+ k_model = fm_wrapper(transformer)
82
+
83
+ if initial_latent is not None:
84
+ sigmas = sigmas * strength
85
+ first_sigma = sigmas[0].to(device=device, dtype=torch.float32)
86
+ initial_latent = initial_latent.to(device=device, dtype=torch.float32)
87
+ latents = initial_latent.float() * (1.0 - first_sigma) + latents.float() * first_sigma
88
+
89
+ if concat_latent is not None:
90
+ concat_latent = concat_latent.to(latents)
91
+
92
+ distilled_guidance = torch.tensor([distilled_guidance_scale * 1000.0] * batch_size).to(device=device, dtype=dtype)
93
+
94
+ prompt_embeds = repeat_to_batch_size(prompt_embeds, batch_size)
95
+ prompt_embeds_mask = repeat_to_batch_size(prompt_embeds_mask, batch_size)
96
+ prompt_poolers = repeat_to_batch_size(prompt_poolers, batch_size)
97
+ negative_prompt_embeds = repeat_to_batch_size(negative_prompt_embeds, batch_size)
98
+ negative_prompt_embeds_mask = repeat_to_batch_size(negative_prompt_embeds_mask, batch_size)
99
+ negative_prompt_poolers = repeat_to_batch_size(negative_prompt_poolers, batch_size)
100
+ concat_latent = repeat_to_batch_size(concat_latent, batch_size)
101
+
102
+ sampler_kwargs = dict(
103
+ dtype=dtype,
104
+ cfg_scale=real_guidance_scale,
105
+ cfg_rescale=guidance_rescale,
106
+ concat_latent=concat_latent,
107
+ positive=dict(
108
+ pooled_projections=prompt_poolers,
109
+ encoder_hidden_states=prompt_embeds,
110
+ encoder_attention_mask=prompt_embeds_mask,
111
+ guidance=distilled_guidance,
112
+ **kwargs,
113
+ ),
114
+ negative=dict(
115
+ pooled_projections=negative_prompt_poolers,
116
+ encoder_hidden_states=negative_prompt_embeds,
117
+ encoder_attention_mask=negative_prompt_embeds_mask,
118
+ guidance=distilled_guidance,
119
+ **(kwargs if negative_kwargs is None else {**kwargs, **negative_kwargs}),
120
+ ),
121
+ )
122
+
123
+ if sampler == "unipc":
124
+ results = sample_unipc(k_model, latents, sigmas, extra_args=sampler_kwargs, disable=False, callback=callback)
125
+ else:
126
+ raise NotImplementedError(f"Sampler {sampler} is not supported.")
127
+
128
+ return results
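The schedule helpers near the top of this file fully determine the sigma schedule used by sample_hunyuan; a standalone check of the mu clipping and the endpoints (values are illustrative, and the import path follows this commit's frame_pack package):

    # Illustrative check of the flux time-shift schedule defined above.
    from frame_pack.k_diffusion_hunyuan import calculate_flux_mu, get_flux_sigmas_from_mu

    seq_length = 9 * 80 * 80 // 4                      # 14400, matching the comment in sample_hunyuan
    mu = calculate_flux_mu(seq_length, exp_max=7.0)    # clipped to log(7) ~= 1.9459 for long sequences
    sigmas = get_flux_sigmas_from_mu(25, mu)           # 26 sigmas from 1.0 down to 0.0
    print(round(mu, 4), sigmas[0].item(), sigmas[-1].item())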
frame_pack/uni_pc_fm.py ADDED
@@ -0,0 +1,142 @@
1
+ # Better Flow Matching UniPC by Lvmin Zhang
2
+ # (c) 2025
3
+ # CC BY-SA 4.0
4
+ # Attribution-ShareAlike 4.0 International Licence
5
+
6
+
7
+ import torch
8
+
9
+ from tqdm.auto import trange
10
+
11
+
12
+ def expand_dims(v, dims):
13
+ return v[(...,) + (None,) * (dims - 1)]
14
+
15
+
16
+ class FlowMatchUniPC:
17
+ def __init__(self, model, extra_args, variant='bh1'):
18
+ self.model = model
19
+ self.variant = variant
20
+ self.extra_args = extra_args
21
+
22
+ def model_fn(self, x, t):
23
+ return self.model(x, t, **self.extra_args)
24
+
25
+ def update_fn(self, x, model_prev_list, t_prev_list, t, order):
26
+ assert order <= len(model_prev_list)
27
+ dims = x.dim()
28
+
29
+ t_prev_0 = t_prev_list[-1]
30
+ lambda_prev_0 = - torch.log(t_prev_0)
31
+ lambda_t = - torch.log(t)
32
+ model_prev_0 = model_prev_list[-1]
33
+
34
+ h = lambda_t - lambda_prev_0
35
+
36
+ rks = []
37
+ D1s = []
38
+ for i in range(1, order):
39
+ t_prev_i = t_prev_list[-(i + 1)]
40
+ model_prev_i = model_prev_list[-(i + 1)]
41
+ lambda_prev_i = - torch.log(t_prev_i)
42
+ rk = ((lambda_prev_i - lambda_prev_0) / h)[0]
43
+ rks.append(rk)
44
+ D1s.append((model_prev_i - model_prev_0) / rk)
45
+
46
+ rks.append(1.)
47
+ rks = torch.tensor(rks, device=x.device)
48
+
49
+ R = []
50
+ b = []
51
+
52
+ hh = -h[0]
53
+ h_phi_1 = torch.expm1(hh)
54
+ h_phi_k = h_phi_1 / hh - 1
55
+
56
+ factorial_i = 1
57
+
58
+ if self.variant == 'bh1':
59
+ B_h = hh
60
+ elif self.variant == 'bh2':
61
+ B_h = torch.expm1(hh)
62
+ else:
63
+ raise NotImplementedError('Bad variant!')
64
+
65
+ for i in range(1, order + 1):
66
+ R.append(torch.pow(rks, i - 1))
67
+ b.append(h_phi_k * factorial_i / B_h)
68
+ factorial_i *= (i + 1)
69
+ h_phi_k = h_phi_k / hh - 1 / factorial_i
70
+
71
+ R = torch.stack(R)
72
+ b = torch.tensor(b, device=x.device)
73
+
74
+ use_predictor = len(D1s) > 0
75
+
76
+ if use_predictor:
77
+ D1s = torch.stack(D1s, dim=1)
78
+ if order == 2:
79
+ rhos_p = torch.tensor([0.5], device=b.device)
80
+ else:
81
+ rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1])
82
+ else:
83
+ D1s = None
84
+ rhos_p = None
85
+
86
+ if order == 1:
87
+ rhos_c = torch.tensor([0.5], device=b.device)
88
+ else:
89
+ rhos_c = torch.linalg.solve(R, b)
90
+
91
+ x_t_ = expand_dims(t / t_prev_0, dims) * x - expand_dims(h_phi_1, dims) * model_prev_0
92
+
93
+ if use_predictor:
94
+ pred_res = torch.tensordot(D1s, rhos_p, dims=([1], [0]))
95
+ else:
96
+ pred_res = 0
97
+
98
+ x_t = x_t_ - expand_dims(B_h, dims) * pred_res
99
+ model_t = self.model_fn(x_t, t)
100
+
101
+ if D1s is not None:
102
+ corr_res = torch.tensordot(D1s, rhos_c[:-1], dims=([1], [0]))
103
+ else:
104
+ corr_res = 0
105
+
106
+ D1_t = (model_t - model_prev_0)
107
+ x_t = x_t_ - expand_dims(B_h, dims) * (corr_res + rhos_c[-1] * D1_t)
108
+
109
+ return x_t, model_t
110
+
111
+ def sample(self, x, sigmas, callback=None, disable_pbar=False):
112
+ order = min(3, len(sigmas) - 2)
113
+ model_prev_list, t_prev_list = [], []
114
+ for i in trange(len(sigmas) - 1, disable=disable_pbar):
115
+ vec_t = sigmas[i].expand(x.shape[0])
116
+
117
+ with torch.no_grad():
118
+ if i == 0:
119
+ model_prev_list = [self.model_fn(x, vec_t)]
120
+ t_prev_list = [vec_t]
121
+ elif i < order:
122
+ init_order = i
123
+ x, model_x = self.update_fn(x, model_prev_list, t_prev_list, vec_t, init_order)
124
+ model_prev_list.append(model_x)
125
+ t_prev_list.append(vec_t)
126
+ else:
127
+ x, model_x = self.update_fn(x, model_prev_list, t_prev_list, vec_t, order)
128
+ model_prev_list.append(model_x)
129
+ t_prev_list.append(vec_t)
130
+
131
+ model_prev_list = model_prev_list[-order:]
132
+ t_prev_list = t_prev_list[-order:]
133
+
134
+ if callback is not None:
135
+ callback({'x': x, 'i': i, 'denoised': model_prev_list[-1]})
136
+
137
+ return model_prev_list[-1]
138
+
139
+
140
+ def sample_unipc(model, noise, sigmas, extra_args=None, callback=None, disable=False, variant='bh1'):
141
+ assert variant in ['bh1', 'bh2']
142
+ return FlowMatchUniPC(model, extra_args=extra_args, variant=variant).sample(noise, sigmas=sigmas, callback=callback, disable_pbar=disable)
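sample_unipc only assumes that the wrapped model maps (x, sigma, **extra_args) to an x0 prediction; a toy stand-in makes that contract concrete (shapes and step count below are arbitrary):

    # Toy check of the sampler interface; the zero-returning model is a stand-in, not real usage.
    import torch
    from frame_pack.uni_pc_fm import sample_unipc

    def toy_model(x, sigma, **extra_args):
        return torch.zeros_like(x)   # pretend the clean sample is all zeros

    noise = torch.randn(1, 4, 8, 8)
    sigmas = torch.linspace(1.0, 0.0, steps=11)
    denoised = sample_unipc(toy_model, noise, sigmas, extra_args={}, disable=True)
    print(denoised.shape)            # torch.Size([1, 4, 8, 8])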
frame_pack/utils.py ADDED
@@ -0,0 +1,617 @@
1
+ import os
2
+ import cv2
3
+ import json
4
+ import random
5
+ import glob
6
+ import torch
7
+ import einops
8
+ import numpy as np
9
+ import datetime
10
+ import torchvision
11
+
12
+ import safetensors.torch as sf
13
+ from PIL import Image
14
+
15
+
16
+ def min_resize(x, m):
17
+ if x.shape[0] < x.shape[1]:
18
+ s0 = m
19
+ s1 = int(float(m) / float(x.shape[0]) * float(x.shape[1]))
20
+ else:
21
+ s0 = int(float(m) / float(x.shape[1]) * float(x.shape[0]))
22
+ s1 = m
23
+ new_max = max(s1, s0)
24
+ raw_max = max(x.shape[0], x.shape[1])
25
+ if new_max < raw_max:
26
+ interpolation = cv2.INTER_AREA
27
+ else:
28
+ interpolation = cv2.INTER_LANCZOS4
29
+ y = cv2.resize(x, (s1, s0), interpolation=interpolation)
30
+ return y
31
+
32
+
33
+ def d_resize(x, y):
34
+ H, W, C = y.shape
35
+ new_min = min(H, W)
36
+ raw_min = min(x.shape[0], x.shape[1])
37
+ if new_min < raw_min:
38
+ interpolation = cv2.INTER_AREA
39
+ else:
40
+ interpolation = cv2.INTER_LANCZOS4
41
+ y = cv2.resize(x, (W, H), interpolation=interpolation)
42
+ return y
43
+
44
+
45
+ def resize_and_center_crop(image, target_width, target_height):
46
+ if target_height == image.shape[0] and target_width == image.shape[1]:
47
+ return image
48
+
49
+ pil_image = Image.fromarray(image)
50
+ original_width, original_height = pil_image.size
51
+ scale_factor = max(target_width / original_width, target_height / original_height)
52
+ resized_width = int(round(original_width * scale_factor))
53
+ resized_height = int(round(original_height * scale_factor))
54
+ resized_image = pil_image.resize((resized_width, resized_height), Image.LANCZOS)
55
+ left = (resized_width - target_width) / 2
56
+ top = (resized_height - target_height) / 2
57
+ right = (resized_width + target_width) / 2
58
+ bottom = (resized_height + target_height) / 2
59
+ cropped_image = resized_image.crop((left, top, right, bottom))
60
+ return np.array(cropped_image)
61
+
62
+
63
+ def resize_and_center_crop_pytorch(image, target_width, target_height):
64
+ B, C, H, W = image.shape
65
+
66
+ if H == target_height and W == target_width:
67
+ return image
68
+
69
+ scale_factor = max(target_width / W, target_height / H)
70
+ resized_width = int(round(W * scale_factor))
71
+ resized_height = int(round(H * scale_factor))
72
+
73
+ resized = torch.nn.functional.interpolate(image, size=(resized_height, resized_width), mode="bilinear", align_corners=False)
74
+
75
+ top = (resized_height - target_height) // 2
76
+ left = (resized_width - target_width) // 2
77
+ cropped = resized[:, :, top : top + target_height, left : left + target_width]
78
+
79
+ return cropped
80
+
81
+
82
+ def resize_without_crop(image, target_width, target_height):
83
+ if target_height == image.shape[0] and target_width == image.shape[1]:
84
+ return image
85
+
86
+ pil_image = Image.fromarray(image)
87
+ resized_image = pil_image.resize((target_width, target_height), Image.LANCZOS)
88
+ return np.array(resized_image)
89
+
90
+
91
+ def just_crop(image, w, h):
92
+ if h == image.shape[0] and w == image.shape[1]:
93
+ return image
94
+
95
+ original_height, original_width = image.shape[:2]
96
+ k = min(original_height / h, original_width / w)
97
+ new_width = int(round(w * k))
98
+ new_height = int(round(h * k))
99
+ x_start = (original_width - new_width) // 2
100
+ y_start = (original_height - new_height) // 2
101
+ cropped_image = image[y_start : y_start + new_height, x_start : x_start + new_width]
102
+ return cropped_image
103
+
104
+
105
+ def write_to_json(data, file_path):
106
+ temp_file_path = file_path + ".tmp"
107
+ with open(temp_file_path, "wt", encoding="utf-8") as temp_file:
108
+ json.dump(data, temp_file, indent=4)
109
+ os.replace(temp_file_path, file_path)
110
+ return
111
+
112
+
113
+ def read_from_json(file_path):
114
+ with open(file_path, "rt", encoding="utf-8") as file:
115
+ data = json.load(file)
116
+ return data
117
+
118
+
119
+ def get_active_parameters(m):
120
+ return {k: v for k, v in m.named_parameters() if v.requires_grad}
121
+
122
+
123
+ def cast_training_params(m, dtype=torch.float32):
124
+ result = {}
125
+ for n, param in m.named_parameters():
126
+ if param.requires_grad:
127
+ param.data = param.to(dtype)
128
+ result[n] = param
129
+ return result
130
+
131
+
132
+ def separate_lora_AB(parameters, B_patterns=None):
133
+ parameters_normal = {}
134
+ parameters_B = {}
135
+
136
+ if B_patterns is None:
137
+ B_patterns = [".lora_B.", "__zero__"]
138
+
139
+ for k, v in parameters.items():
140
+ if any(B_pattern in k for B_pattern in B_patterns):
141
+ parameters_B[k] = v
142
+ else:
143
+ parameters_normal[k] = v
144
+
145
+ return parameters_normal, parameters_B
146
+
147
+
148
+ def set_attr_recursive(obj, attr, value):
149
+ attrs = attr.split(".")
150
+ for name in attrs[:-1]:
151
+ obj = getattr(obj, name)
152
+ setattr(obj, attrs[-1], value)
153
+ return
154
+
155
+
156
+ def print_tensor_list_size(tensors):
157
+ total_size = 0
158
+ total_elements = 0
159
+
160
+ if isinstance(tensors, dict):
161
+ tensors = tensors.values()
162
+
163
+ for tensor in tensors:
164
+ total_size += tensor.nelement() * tensor.element_size()
165
+ total_elements += tensor.nelement()
166
+
167
+ total_size_MB = total_size / (1024**2)
168
+ total_elements_B = total_elements / 1e9
169
+
170
+ print(f"Total number of tensors: {len(tensors)}")
171
+ print(f"Total size of tensors: {total_size_MB:.2f} MB")
172
+ print(f"Total number of parameters: {total_elements_B:.3f} billion")
173
+ return
174
+
175
+
176
+ @torch.no_grad()
177
+ def batch_mixture(a, b=None, probability_a=0.5, mask_a=None):
178
+ batch_size = a.size(0)
179
+
180
+ if b is None:
181
+ b = torch.zeros_like(a)
182
+
183
+ if mask_a is None:
184
+ mask_a = torch.rand(batch_size) < probability_a
185
+
186
+ mask_a = mask_a.to(a.device)
187
+ mask_a = mask_a.reshape((batch_size,) + (1,) * (a.dim() - 1))
188
+ result = torch.where(mask_a, a, b)
189
+ return result
190
+
191
+
192
+ @torch.no_grad()
193
+ def zero_module(module):
194
+ for p in module.parameters():
195
+ p.detach().zero_()
196
+ return module
197
+
198
+
199
+ @torch.no_grad()
200
+ def supress_lower_channels(m, k, alpha=0.01):
201
+ data = m.weight.data.clone()
202
+
203
+ assert int(data.shape[1]) >= k
204
+
205
+ data[:, :k] = data[:, :k] * alpha
206
+ m.weight.data = data.contiguous().clone()
207
+ return m
208
+
209
+
210
+ def freeze_module(m):
211
+ if not hasattr(m, "_forward_inside_frozen_module"):
212
+ m._forward_inside_frozen_module = m.forward
213
+ m.requires_grad_(False)
214
+ m.forward = torch.no_grad()(m.forward)
215
+ return m
216
+
217
+
218
+ def get_latest_safetensors(folder_path):
219
+ safetensors_files = glob.glob(os.path.join(folder_path, "*.safetensors"))
220
+
221
+ if not safetensors_files:
222
+ raise ValueError("No file to resume!")
223
+
224
+ latest_file = max(safetensors_files, key=os.path.getmtime)
225
+ latest_file = os.path.abspath(os.path.realpath(latest_file))
226
+ return latest_file
227
+
228
+
229
+ def generate_random_prompt_from_tags(tags_str, min_length=3, max_length=32):
230
+ tags = tags_str.split(", ")
231
+ tags = random.sample(tags, k=min(random.randint(min_length, max_length), len(tags)))
232
+ prompt = ", ".join(tags)
233
+ return prompt
234
+
235
+
236
+ def interpolate_numbers(a, b, n, round_to_int=False, gamma=1.0):
237
+ numbers = a + (b - a) * (np.linspace(0, 1, n) ** gamma)
238
+ if round_to_int:
239
+ numbers = np.round(numbers).astype(int)
240
+ return numbers.tolist()
241
+
242
+
243
+ def uniform_random_by_intervals(inclusive, exclusive, n, round_to_int=False):
244
+ edges = np.linspace(0, 1, n + 1)
245
+ points = np.random.uniform(edges[:-1], edges[1:])
246
+ numbers = inclusive + (exclusive - inclusive) * points
247
+ if round_to_int:
248
+ numbers = np.round(numbers).astype(int)
249
+ return numbers.tolist()
250
+
251
+
252
+ def soft_append_bcthw(history, current, overlap=0):
253
+ if overlap <= 0:
254
+ return torch.cat([history, current], dim=2)
255
+
256
+ assert history.shape[2] >= overlap, f"History length ({history.shape[2]}) must be >= overlap ({overlap})"
257
+ assert current.shape[2] >= overlap, f"Current length ({current.shape[2]}) must be >= overlap ({overlap})"
258
+
259
+ weights = torch.linspace(1, 0, overlap, dtype=history.dtype, device=history.device).view(1, 1, -1, 1, 1)
260
+ blended = weights * history[:, :, -overlap:] + (1 - weights) * current[:, :, :overlap]
261
+ output = torch.cat([history[:, :, :-overlap], blended, current[:, :, overlap:]], dim=2)
262
+
263
+ return output.to(history)
264
+
265
+
266
+ def save_bcthw_as_mp4(x, output_filename, fps=10):
267
+ b, c, t, h, w = x.shape
268
+
269
+ per_row = b
270
+ for p in [6, 5, 4, 3, 2]:
271
+ if b % p == 0:
272
+ per_row = p
273
+ break
274
+
275
+ os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
276
+ x = torch.clamp(x.float(), -1.0, 1.0) * 127.5 + 127.5
277
+ x = x.detach().cpu().to(torch.uint8)
278
+ x = einops.rearrange(x, "(m n) c t h w -> t (m h) (n w) c", n=per_row)
279
+ torchvision.io.write_video(output_filename, x, fps=fps, video_codec="libx264", options={"crf": "0"})
280
+
281
+ # write tensor as .pt file
282
+ torch.save(x, output_filename.replace(".mp4", ".pt"))
283
+
284
+ return x
285
+
286
+
287
+ def save_bcthw_as_png(x, output_filename):
288
+ os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
289
+ x = torch.clamp(x.float(), -1.0, 1.0) * 127.5 + 127.5
290
+ x = x.detach().cpu().to(torch.uint8)
291
+ x = einops.rearrange(x, "b c t h w -> c (b h) (t w)")
292
+ torchvision.io.write_png(x, output_filename)
293
+ return output_filename
294
+
295
+
296
+ def save_bchw_as_png(x, output_filename):
297
+ os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
298
+ x = torch.clamp(x.float(), -1.0, 1.0) * 127.5 + 127.5
299
+ x = x.detach().cpu().to(torch.uint8)
300
+ x = einops.rearrange(x, "b c h w -> c h (b w)")
301
+ torchvision.io.write_png(x, output_filename)
302
+ return output_filename
303
+
304
+
305
+ def add_tensors_with_padding(tensor1, tensor2):
306
+ if tensor1.shape == tensor2.shape:
307
+ return tensor1 + tensor2
308
+
309
+ shape1 = tensor1.shape
310
+ shape2 = tensor2.shape
311
+
312
+ new_shape = tuple(max(s1, s2) for s1, s2 in zip(shape1, shape2))
313
+
314
+ padded_tensor1 = torch.zeros(new_shape)
315
+ padded_tensor2 = torch.zeros(new_shape)
316
+
317
+ padded_tensor1[tuple(slice(0, s) for s in shape1)] = tensor1
318
+ padded_tensor2[tuple(slice(0, s) for s in shape2)] = tensor2
319
+
320
+ result = padded_tensor1 + padded_tensor2
321
+ return result
322
+
323
+
324
+ def print_free_mem():
325
+ torch.cuda.empty_cache()
326
+ free_mem, total_mem = torch.cuda.mem_get_info(0)
327
+ free_mem_mb = free_mem / (1024**2)
328
+ total_mem_mb = total_mem / (1024**2)
329
+ print(f"Free memory: {free_mem_mb:.2f} MB")
330
+ print(f"Total memory: {total_mem_mb:.2f} MB")
331
+ return
332
+
333
+
334
+ def print_gpu_parameters(device, state_dict, log_count=1):
335
+ summary = {"device": device, "keys_count": len(state_dict)}
336
+
337
+ logged_params = {}
338
+ for i, (key, tensor) in enumerate(state_dict.items()):
339
+ if i >= log_count:
340
+ break
341
+ logged_params[key] = tensor.flatten()[:3].tolist()
342
+
343
+ summary["params"] = logged_params
344
+
345
+ print(str(summary))
346
+ return
347
+
348
+
349
+ def visualize_txt_as_img(width, height, text, font_path="font/DejaVuSans.ttf", size=18):
350
+ from PIL import Image, ImageDraw, ImageFont
351
+
352
+ txt = Image.new("RGB", (width, height), color="white")
353
+ draw = ImageDraw.Draw(txt)
354
+ font = ImageFont.truetype(font_path, size=size)
355
+
356
+ if text == "":
357
+ return np.array(txt)
358
+
359
+ # Split text into lines that fit within the image width
360
+ lines = []
361
+ words = text.split()
362
+ current_line = words[0]
363
+
364
+ for word in words[1:]:
365
+ line_with_word = f"{current_line} {word}"
366
+ if draw.textbbox((0, 0), line_with_word, font=font)[2] <= width:
367
+ current_line = line_with_word
368
+ else:
369
+ lines.append(current_line)
370
+ current_line = word
371
+
372
+ lines.append(current_line)
373
+
374
+ # Draw the text line by line
375
+ y = 0
376
+ line_height = draw.textbbox((0, 0), "A", font=font)[3]
377
+
378
+ for line in lines:
379
+ if y + line_height > height:
380
+ break # stop drawing if the next line will be outside the image
381
+ draw.text((0, y), line, fill="black", font=font)
382
+ y += line_height
383
+
384
+ return np.array(txt)
385
+
386
+
387
+ def blue_mark(x):
388
+ x = x.copy()
389
+ c = x[:, :, 2]
390
+ b = cv2.blur(c, (9, 9))
391
+ x[:, :, 2] = ((c - b) * 16.0 + b).clip(-1, 1)
392
+ return x
393
+
394
+
395
+ def green_mark(x):
396
+ x = x.copy()
397
+ x[:, :, 2] = -1
398
+ x[:, :, 0] = -1
399
+ return x
400
+
401
+
402
+ def frame_mark(x):
403
+ x = x.copy()
404
+ x[:64] = -1
405
+ x[-64:] = -1
406
+ x[:, :8] = 1
407
+ x[:, -8:] = 1
408
+ return x
409
+
410
+
411
+ @torch.inference_mode()
412
+ def pytorch2numpy(imgs):
413
+ results = []
414
+ for x in imgs:
415
+ y = x.movedim(0, -1)
416
+ y = y * 127.5 + 127.5
417
+ y = y.detach().float().cpu().numpy().clip(0, 255).astype(np.uint8)
418
+ results.append(y)
419
+ return results
420
+
421
+
422
+ @torch.inference_mode()
423
+ def numpy2pytorch(imgs):
424
+ h = torch.from_numpy(np.stack(imgs, axis=0)).float() / 127.5 - 1.0
425
+ h = h.movedim(-1, 1)
426
+ return h
427
+
428
+
429
+ @torch.no_grad()
430
+ def duplicate_prefix_to_suffix(x, count, zero_out=False):
431
+ if zero_out:
432
+ return torch.cat([x, torch.zeros_like(x[:count])], dim=0)
433
+ else:
434
+ return torch.cat([x, x[:count]], dim=0)
435
+
436
+
437
+ def weighted_mse(a, b, weight):
438
+ return torch.mean(weight.float() * (a.float() - b.float()) ** 2)
439
+
440
+
441
+ def clamped_linear_interpolation(x, x_min, y_min, x_max, y_max, sigma=1.0):
442
+ x = (x - x_min) / (x_max - x_min)
443
+ x = max(0.0, min(x, 1.0))
444
+ x = x**sigma
445
+ return y_min + x * (y_max - y_min)
446
+
447
+
448
+ def expand_to_dims(x, target_dims):
449
+ return x.view(*x.shape, *([1] * max(0, target_dims - x.dim())))
450
+
451
+
452
+ def repeat_to_batch_size(tensor: torch.Tensor, batch_size: int):
453
+ if tensor is None:
454
+ return None
455
+
456
+ first_dim = tensor.shape[0]
457
+
458
+ if first_dim == batch_size:
459
+ return tensor
460
+
461
+ if batch_size % first_dim != 0:
462
+ raise ValueError(f"Cannot evenly repeat first dim {first_dim} to match batch_size {batch_size}.")
463
+
464
+ repeat_times = batch_size // first_dim
465
+
466
+ return tensor.repeat(repeat_times, *[1] * (tensor.dim() - 1))
467
+
468
+
469
+ def dim5(x):
470
+ return expand_to_dims(x, 5)
471
+
472
+
473
+ def dim4(x):
474
+ return expand_to_dims(x, 4)
475
+
476
+
477
+ def dim3(x):
478
+ return expand_to_dims(x, 3)
479
+
480
+
481
+ def crop_or_pad_yield_mask(x, length):
482
+ B, F, C = x.shape
483
+ device = x.device
484
+ dtype = x.dtype
485
+
486
+ if F < length:
487
+ y = torch.zeros((B, length, C), dtype=dtype, device=device)
488
+ mask = torch.zeros((B, length), dtype=torch.bool, device=device)
489
+ y[:, :F, :] = x
490
+ mask[:, :F] = True
491
+ return y, mask
492
+
493
+ return x[:, :length, :], torch.ones((B, length), dtype=torch.bool, device=device)
494
+
495
+
496
+ def extend_dim(x, dim, minimal_length, zero_pad=False):
497
+ original_length = int(x.shape[dim])
498
+
499
+ if original_length >= minimal_length:
500
+ return x
501
+
502
+ if zero_pad:
503
+ padding_shape = list(x.shape)
504
+ padding_shape[dim] = minimal_length - original_length
505
+ padding = torch.zeros(padding_shape, dtype=x.dtype, device=x.device)
506
+ else:
507
+ idx = (slice(None),) * dim + (slice(-1, None),) + (slice(None),) * (len(x.shape) - dim - 1)
508
+ last_element = x[idx]
509
+ padding = last_element.repeat_interleave(minimal_length - original_length, dim=dim)
510
+
511
+ return torch.cat([x, padding], dim=dim)
512
+
513
+
514
+ def lazy_positional_encoding(t, repeats=None):
515
+ if not isinstance(t, list):
516
+ t = [t]
517
+
518
+ from diffusers.models.embeddings import get_timestep_embedding
519
+
520
+ te = torch.tensor(t)
521
+ te = get_timestep_embedding(timesteps=te, embedding_dim=256, flip_sin_to_cos=True, downscale_freq_shift=0.0, scale=1.0)
522
+
523
+ if repeats is None:
524
+ return te
525
+
526
+ te = te[:, None, :].expand(-1, repeats, -1)
527
+
528
+ return te
529
+
530
+
531
+ def state_dict_offset_merge(A, B, C=None):
532
+ result = {}
533
+ keys = A.keys()
534
+
535
+ for key in keys:
536
+ A_value = A[key]
537
+ B_value = B[key].to(A_value)
538
+
539
+ if C is None:
540
+ result[key] = A_value + B_value
541
+ else:
542
+ C_value = C[key].to(A_value)
543
+ result[key] = A_value + B_value - C_value
544
+
545
+ return result
546
+
547
+
548
+ def state_dict_weighted_merge(state_dicts, weights):
549
+ if len(state_dicts) != len(weights):
550
+ raise ValueError("Number of state dictionaries must match number of weights")
551
+
552
+ if not state_dicts:
553
+ return {}
554
+
555
+ total_weight = sum(weights)
556
+
557
+ if total_weight == 0:
558
+ raise ValueError("Sum of weights cannot be zero")
559
+
560
+ normalized_weights = [w / total_weight for w in weights]
561
+
562
+ keys = state_dicts[0].keys()
563
+ result = {}
564
+
565
+ for key in keys:
566
+ result[key] = state_dicts[0][key] * normalized_weights[0]
567
+
568
+ for i in range(1, len(state_dicts)):
569
+ state_dict_value = state_dicts[i][key].to(result[key])
570
+ result[key] += state_dict_value * normalized_weights[i]
571
+
572
+ return result
573
+
574
+
575
+ def group_files_by_folder(all_files):
576
+ grouped_files = {}
577
+
578
+ for file in all_files:
579
+ folder_name = os.path.basename(os.path.dirname(file))
580
+ if folder_name not in grouped_files:
581
+ grouped_files[folder_name] = []
582
+ grouped_files[folder_name].append(file)
583
+
584
+ list_of_lists = list(grouped_files.values())
585
+ return list_of_lists
586
+
587
+
588
+ def generate_timestamp():
589
+ now = datetime.datetime.now()
590
+ timestamp = now.strftime("%y%m%d_%H%M%S")
591
+ milliseconds = f"{int(now.microsecond / 1000):03d}"
592
+ random_number = random.randint(0, 9999)
593
+ return f"{timestamp}_{milliseconds}_{random_number}"
594
+
595
+
596
+ def write_PIL_image_with_png_info(image, metadata, path):
597
+ from PIL.PngImagePlugin import PngInfo
598
+
599
+ png_info = PngInfo()
600
+ for key, value in metadata.items():
601
+ png_info.add_text(key, value)
602
+
603
+ image.save(path, "PNG", pnginfo=png_info)
604
+ return image
605
+
606
+
607
+ def torch_safe_save(content, path):
608
+ torch.save(content, path + "_tmp")
609
+ os.replace(path + "_tmp", path)
610
+ return path
611
+
612
+
613
+ def move_optimizer_to_device(optimizer, device):
614
+ for state in optimizer.state.values():
615
+ for k, v in state.items():
616
+ if isinstance(v, torch.Tensor):
617
+ state[k] = v.to(device)
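Among the helpers above, soft_append_bcthw is the one that stitches consecutive latent chunks together with a linear cross-fade over the overlapping frames; a small shape check (dimensions chosen arbitrarily):

    # Shape check for soft_append_bcthw: 5 + 5 frames with a 2-frame cross-fade -> 8 frames.
    import torch
    from frame_pack.utils import soft_append_bcthw

    history = torch.randn(1, 16, 5, 8, 8)   # B, C, T, H, W latents
    current = torch.randn(1, 16, 5, 8, 8)
    merged = soft_append_bcthw(history, current, overlap=2)
    print(merged.shape)                     # torch.Size([1, 16, 8, 8, 8])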
frame_pack/wrapper.py ADDED
@@ -0,0 +1,51 @@
1
+ import torch
2
+
3
+
4
+ def append_dims(x, target_dims):
5
+ return x[(...,) + (None,) * (target_dims - x.ndim)]
6
+
7
+
8
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=1.0):
9
+ if guidance_rescale == 0:
10
+ return noise_cfg
11
+
12
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
13
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
14
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
15
+ noise_cfg = guidance_rescale * noise_pred_rescaled + (1.0 - guidance_rescale) * noise_cfg
16
+ return noise_cfg
17
+
18
+
19
+ def fm_wrapper(transformer, t_scale=1000.0):
20
+ def k_model(x, sigma, **extra_args):
21
+ dtype = extra_args['dtype']
22
+ cfg_scale = extra_args['cfg_scale']
23
+ cfg_rescale = extra_args['cfg_rescale']
24
+ concat_latent = extra_args['concat_latent']
25
+
26
+ original_dtype = x.dtype
27
+ sigma = sigma.float()
28
+
29
+ x = x.to(dtype)
30
+ timestep = (sigma * t_scale).to(dtype)
31
+
32
+ if concat_latent is None:
33
+ hidden_states = x
34
+ else:
35
+ hidden_states = torch.cat([x, concat_latent.to(x)], dim=1)
36
+
37
+ pred_positive = transformer(hidden_states=hidden_states, timestep=timestep, return_dict=False, **extra_args['positive'])[0].float()
38
+
39
+ if cfg_scale == 1.0:
40
+ pred_negative = torch.zeros_like(pred_positive)
41
+ else:
42
+ pred_negative = transformer(hidden_states=hidden_states, timestep=timestep, return_dict=False, **extra_args['negative'])[0].float()
43
+
44
+ pred_cfg = pred_negative + cfg_scale * (pred_positive - pred_negative)
45
+ pred = rescale_noise_cfg(pred_cfg, pred_positive, guidance_rescale=cfg_rescale)
46
+
47
+ x0 = x.float() - pred.float() * append_dims(sigma, x.ndim)
48
+
49
+ return x0.to(dtype=original_dtype)
50
+
51
+ return k_model
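fm_wrapper turns the transformer's prediction into an x0 estimate via x0 = x - sigma * pred, which matches how sample_hunyuan mixes initial_latent and noise as (1 - sigma) * x0 + sigma * noise; a quick numeric check of that identity (shapes are arbitrary):

    # Numeric check of the x0 = x_t - sigma * v relation used in fm_wrapper's k_model.
    import torch
    from frame_pack.wrapper import append_dims

    x0 = torch.randn(2, 4, 8, 8)
    noise = torch.randn_like(x0)
    sigma = torch.tensor([0.7, 0.3])
    s = append_dims(sigma, x0.ndim)          # broadcastable shape (2, 1, 1, 1)
    x_t = (1 - s) * x0 + s * noise           # flow-matching interpolation
    v = noise - x0                           # velocity-style prediction assumed by the wrapper
    print(torch.allclose(x_t - s * v, x0, atol=1e-6))   # True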
hunyuan_model/fp8_optimization.py ADDED
@@ -0,0 +1,39 @@
1
+ #based on ComfyUI's and MinusZoneAI's fp8_linear optimization
2
+ #further borrowed from HunyuanVideoWrapper for Musubi Tuner
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ def fp8_linear_forward(cls, original_dtype, input):
7
+ weight_dtype = cls.weight.dtype
8
+ if weight_dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
9
+ if len(input.shape) == 3:
10
+ target_dtype = torch.float8_e5m2 if weight_dtype == torch.float8_e4m3fn else torch.float8_e4m3fn
11
+ inn = input.reshape(-1, input.shape[2]).to(target_dtype)
12
+ w = cls.weight.t()
13
+
14
+ scale = torch.ones((1), device=input.device, dtype=torch.float32)
15
+ bias = cls.bias.to(original_dtype) if cls.bias is not None else None
16
+
17
+ if bias is not None:
18
+ o = torch._scaled_mm(inn, w, out_dtype=original_dtype, bias=bias, scale_a=scale, scale_b=scale)
19
+ else:
20
+ o = torch._scaled_mm(inn, w, out_dtype=original_dtype, scale_a=scale, scale_b=scale)
21
+
22
+ if isinstance(o, tuple):
23
+ o = o[0]
24
+
25
+ return o.reshape((-1, input.shape[1], cls.weight.shape[0]))
26
+ else:
27
+ return cls.original_forward(input.to(original_dtype))
28
+ else:
29
+ return cls.original_forward(input)
30
+
31
+ def convert_fp8_linear(module, original_dtype, params_to_keep={}):
32
+ setattr(module, "fp8_matmul_enabled", True)
33
+
34
+ for name, module in module.named_modules():
35
+ if not any(keyword in name for keyword in params_to_keep):
36
+ if isinstance(module, nn.Linear):
37
+ original_forward = module.forward
38
+ setattr(module, "original_forward", original_forward)
39
+ setattr(module, "forward", lambda input, m=module: fp8_linear_forward(m, original_dtype, input))
hv_generate_video.py CHANGED
@@ -25,6 +25,7 @@ from hunyuan_model.text_encoder import TextEncoder
25
  from hunyuan_model.text_encoder import PROMPT_TEMPLATE
26
  from hunyuan_model.vae import load_vae
27
  from hunyuan_model.models import load_transformer, get_rotary_pos_embed
 
28
  from modules.scheduling_flow_match_discrete import FlowMatchDiscreteScheduler
29
  from networks import lora
30
 
@@ -313,23 +314,6 @@ def encode_input_prompt(prompt: Union[str, list[str]], args, device, fp8_llm=Fal
313
  # endregion
314
 
315
 
316
- def load_images(image_dir, video_length, bucket_reso):
317
- image_files = glob_images(image_dir)
318
- if len(image_files) == 0:
319
- raise ValueError(f"No image files found in {image_dir}")
320
- if len(image_files) < video_length:
321
- raise ValueError(f"Number of images in {image_dir} is less than {video_length}")
322
-
323
- image_files.sort()
324
- images = []
325
- for image_file in image_files[:video_length]:
326
- image = Image.open(image_file)
327
- image = resize_image_to_bucket(image, bucket_reso) # returns a numpy array
328
- images.append(image)
329
-
330
- return images
331
-
332
-
333
  def prepare_vae(args, device):
334
  vae_dtype = torch.float16 if args.vae_dtype is None else str_to_dtype(args.vae_dtype)
335
  vae, _, s_ratio, t_ratio = load_vae(vae_dtype=vae_dtype, device=device, vae_path=args.vae)
@@ -479,6 +463,15 @@ def parse_args():
479
  parser.add_argument("--no_metadata", action="store_true", help="do not save metadata")
480
  parser.add_argument("--latent_path", type=str, nargs="*", default=None, help="path to latent for decode. no inference")
481
  parser.add_argument("--lycoris", action="store_true", help="use lycoris for inference")
482
 
483
  args = parser.parse_args()
484
 
@@ -488,6 +481,9 @@ def parse_args():
488
 
489
  # update dit_weight based on model_base if not exists
490
 
 
 
 
491
  return args
492
 
493
 
@@ -573,12 +569,7 @@ def main():
573
  if args.video_path is not None:
574
  # v2v inference
575
  logger.info(f"Video2Video inference: {args.video_path}")
576
-
577
- if os.path.isfile(args.video_path):
578
- video = load_video(args.video_path, 0, video_length, bucket_reso=(width, height)) # list of frames
579
- else:
580
- video = load_images(args.video_path, video_length, bucket_reso=(width, height)) # list of frames
581
-
582
  if len(video) < video_length:
583
  raise ValueError(f"Video length is less than {video_length}")
584
  video = np.stack(video, axis=0) # F, H, W, C
@@ -682,16 +673,50 @@ def main():
682
  logger.info("Merged model saved")
683
  return
684
 
685
  if blocks_to_swap > 0:
686
- logger.info(f"Casting model to {dit_weight_dtype}")
687
- transformer.to(dtype=dit_weight_dtype)
688
  logger.info(f"Enable swap {blocks_to_swap} blocks to CPU from device: {device}")
689
  transformer.enable_block_swap(blocks_to_swap, device, supports_backward=False)
690
  transformer.move_to_device_except_swap_blocks(device)
691
  transformer.prepare_block_swap_before_forward()
692
  else:
693
- logger.info(f"Moving and casting model to {device} and {dit_weight_dtype}")
694
- transformer.to(device=device, dtype=dit_weight_dtype)
695
  if args.img_in_txt_in_offloading:
696
  logger.info("Enable offloading img_in and txt_in to CPU")
697
  transformer.enable_img_in_txt_in_offloading()
 
25
  from hunyuan_model.text_encoder import PROMPT_TEMPLATE
26
  from hunyuan_model.vae import load_vae
27
  from hunyuan_model.models import load_transformer, get_rotary_pos_embed
28
+ from hunyuan_model.fp8_optimization import convert_fp8_linear
29
  from modules.scheduling_flow_match_discrete import FlowMatchDiscreteScheduler
30
  from networks import lora
31
 
 
314
  # endregion
315
 
316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  def prepare_vae(args, device):
318
  vae_dtype = torch.float16 if args.vae_dtype is None else str_to_dtype(args.vae_dtype)
319
  vae, _, s_ratio, t_ratio = load_vae(vae_dtype=vae_dtype, device=device, vae_path=args.vae)
 
463
  parser.add_argument("--no_metadata", action="store_true", help="do not save metadata")
464
  parser.add_argument("--latent_path", type=str, nargs="*", default=None, help="path to latent for decode. no inference")
465
  parser.add_argument("--lycoris", action="store_true", help="use lycoris for inference")
466
+ parser.add_argument("--fp8_fast", action="store_true", help="Enable fast FP8 arithmetic (RTX 4XXX+)")
467
+ parser.add_argument("--compile", action="store_true", help="Enable torch.compile")
468
+ parser.add_argument(
469
+ "--compile_args",
470
+ nargs=4,
471
+ metavar=("BACKEND", "MODE", "DYNAMIC", "FULLGRAPH"),
472
+ default=["inductor", "max-autotune-no-cudagraphs", "False", "False"],
473
+ help="Torch.compile settings",
474
+ )
475
 
476
  args = parser.parse_args()
477
 
 
481
 
482
  # update dit_weight based on model_base if not exists
483
 
484
+ if args.fp8_fast and not args.fp8:
485
+ raise ValueError("--fp8_fast requires --fp8")
486
+
487
  return args
488
 
489
 
 
569
  if args.video_path is not None:
570
  # v2v inference
571
  logger.info(f"Video2Video inference: {args.video_path}")
572
+ video = load_video(args.video_path, 0, video_length, bucket_reso=(width, height)) # list of frames
 
 
 
 
 
573
  if len(video) < video_length:
574
  raise ValueError(f"Video length is less than {video_length}")
575
  video = np.stack(video, axis=0) # F, H, W, C
 
673
  logger.info("Merged model saved")
674
  return
675
 
676
+ logger.info(f"Casting model to {dit_weight_dtype}")
677
+ transformer.to(dtype=dit_weight_dtype)
678
+
679
+ if args.fp8_fast:
680
+ logger.info("Enabling FP8 acceleration")
681
+ params_to_keep = {"norm", "bias", "time_in", "vector_in", "guidance_in", "txt_in", "img_in"}
682
+ for name, param in transformer.named_parameters():
683
+ dtype_to_use = dit_dtype if any(keyword in name for keyword in params_to_keep) else dit_weight_dtype
684
+ param.to(dtype=dtype_to_use)
685
+ convert_fp8_linear(transformer, dit_dtype, params_to_keep=params_to_keep)
686
+
687
+ if args.compile:
688
+ compile_backend, compile_mode, compile_dynamic, compile_fullgraph = args.compile_args
689
+ logger.info(
690
+ f"Torch Compiling[Backend: {compile_backend}; Mode: {compile_mode}; Dynamic: {compile_dynamic}; Fullgraph: {compile_fullgraph}]"
691
+ )
692
+ torch._dynamo.config.cache_size_limit = 32
693
+ for i, block in enumerate(transformer.single_blocks):
694
+ compiled_block = torch.compile(
695
+ block,
696
+ backend=compile_backend,
697
+ mode=compile_mode,
698
+ dynamic=compile_dynamic.lower() in "true",
699
+ fullgraph=compile_fullgraph.lower() in "true",
700
+ )
701
+ transformer.single_blocks[i] = compiled_block
702
+ for i, block in enumerate(transformer.double_blocks):
703
+ compiled_block = torch.compile(
704
+ block,
705
+ backend=compile_backend,
706
+ mode=compile_mode,
707
+ dynamic=compile_dynamic.lower() in "true",
708
+ fullgraph=compile_fullgraph.lower() in "true",
709
+ )
710
+ transformer.double_blocks[i] = compiled_block
711
+
712
  if blocks_to_swap > 0:
 
 
713
  logger.info(f"Enable swap {blocks_to_swap} blocks to CPU from device: {device}")
714
  transformer.enable_block_swap(blocks_to_swap, device, supports_backward=False)
715
  transformer.move_to_device_except_swap_blocks(device)
716
  transformer.prepare_block_swap_before_forward()
717
  else:
718
+ logger.info(f"Moving model to {device}")
719
+ transformer.to(device=device)
720
  if args.img_in_txt_in_offloading:
721
  logger.info("Enable offloading img_in and txt_in to CPU")
722
  transformer.enable_img_in_txt_in_offloading()
hv_train_network.py CHANGED
@@ -24,7 +24,7 @@ import toml
24
 
25
  import torch
26
  from tqdm import tqdm
27
- from accelerate.utils import set_seed
28
  from accelerate import Accelerator, InitProcessGroupKwargs, DistributedDataParallelKwargs, PartialState
29
  from safetensors.torch import load_file
30
  import transformers
@@ -159,11 +159,21 @@ def prepare_accelerator(args: argparse.Namespace) -> Accelerator:
159
  ]
160
  kwargs_handlers = [i for i in kwargs_handlers if i is not None]
161
 
162
  accelerator = Accelerator(
163
  gradient_accumulation_steps=args.gradient_accumulation_steps,
164
  mixed_precision=args.mixed_precision,
165
  log_with=log_with,
166
  project_dir=logging_dir,
 
167
  kwargs_handlers=kwargs_handlers,
168
  )
169
  print("accelerator device:", accelerator.device)
@@ -228,6 +238,25 @@ def line_to_prompt_dict(line: str) -> dict:
228
  prompt_dict["image_path"] = m.group(1)
229
  continue
230
 
231
  except ValueError as ex:
232
  logger.error(f"Exception in parsing / 解析エラー: {parg}")
233
  logger.error(ex)
@@ -340,8 +369,7 @@ def should_sample_images(args, steps, epoch=None):
340
 
341
  class NetworkTrainer:
342
  def __init__(self):
343
- self._i2v_training = False
344
- self.pos_embed_cache = {}
345
 
346
  # TODO 他のスクリプトと共通化する
347
  def generate_step_logs(
@@ -872,7 +900,7 @@ class NetworkTrainer:
872
  transformer.switch_block_swap_for_inference()
873
 
874
  # Create a directory to save the samples
875
- save_dir = args.output_dir + "/sample"
876
  os.makedirs(save_dir, exist_ok=True)
877
 
878
  # save random state to restore later
@@ -919,13 +947,15 @@ class NetworkTrainer:
919
  width = sample_parameter.get("width", 256) # make smaller for faster and memory saving inference
920
  height = sample_parameter.get("height", 256)
921
  frame_count = sample_parameter.get("frame_count", 1)
922
- guidance_scale = sample_parameter.get("guidance_scale", 6.0)
923
  discrete_flow_shift = sample_parameter.get("discrete_flow_shift", 14.5)
924
  seed = sample_parameter.get("seed")
925
  prompt: str = sample_parameter.get("prompt", "")
926
  cfg_scale = sample_parameter.get("cfg_scale", None) # None for architecture default
927
  negative_prompt = sample_parameter.get("negative_prompt", None)
928
 
 
 
929
  if self.i2v_training:
930
  image_path = sample_parameter.get("image_path", None)
931
  if image_path is None:
@@ -934,6 +964,16 @@ class NetworkTrainer:
934
  else:
935
  image_path = None
936
 
937
  device = accelerator.device
938
  if seed is not None:
939
  torch.manual_seed(seed)
@@ -963,6 +1003,8 @@ class NetworkTrainer:
963
 
964
  if self.i2v_training:
965
  logger.info(f"image path: {image_path}")
 
 
966
 
967
  # inference: architecture dependent
968
  video = self.do_inference(
@@ -982,9 +1024,14 @@ class NetworkTrainer:
982
  guidance_scale,
983
  cfg_scale,
984
  image_path=image_path,
 
985
  )
986
 
987
  # Save video
 
 
 
 
988
  ts_str = time.strftime("%Y%m%d%H%M%S", time.localtime())
989
  num_suffix = f"e{epoch:06d}" if epoch is not None else f"{steps:06d}"
990
  seed_suffix = "" if seed is None else f"_{seed}"
@@ -1011,15 +1058,25 @@ class NetworkTrainer:
1011
  def architecture_full_name(self) -> str:
1012
  return ARCHITECTURE_HUNYUAN_VIDEO_FULL
1013
 
1014
- def assert_model_specific_args(self, args: argparse.Namespace):
 
 
1015
  self._i2v_training = args.dit_in_channels == 32 # may be changed in the future
1016
  if self._i2v_training:
1017
  logger.info("I2V training mode")
1018
 
 
 
 
 
1019
  @property
1020
  def i2v_training(self) -> bool:
1021
  return self._i2v_training
1022
 
 
 
 
 
1023
  def process_sample_prompts(
1024
  self,
1025
  args: argparse.Namespace,
@@ -1108,6 +1165,7 @@ class NetworkTrainer:
1108
  guidance_scale,
1109
  cfg_scale,
1110
  image_path=None,
 
1111
  ):
1112
  """architecture dependent inference"""
1113
  device = accelerator.device
@@ -1260,12 +1318,13 @@ class NetworkTrainer:
1260
 
1261
  def load_transformer(
1262
  self,
 
1263
  args: argparse.Namespace,
1264
  dit_path: str,
1265
  attn_mode: str,
1266
  split_attn: bool,
1267
  loading_device: str,
1268
- dit_weight_dtype: torch.dtype,
1269
  ):
1270
  transformer = load_transformer(dit_path, attn_mode, split_attn, loading_device, dit_weight_dtype, args.dit_in_channels)
1271
 
@@ -1346,9 +1405,16 @@ class NetworkTrainer:
1346
  raise ValueError("dataset_config is required / dataset_configが必要です")
1347
  if args.dit is None:
1348
  raise ValueError("path to DiT model is required / DiTモデルのパスが必要です")
1349
 
1350
  # check model specific arguments
1351
- self.assert_model_specific_args(args)
1352
 
1353
  # show timesteps for debugging
1354
  if args.show_timesteps:
@@ -1389,7 +1455,7 @@ class NetworkTrainer:
1389
 
1390
  # HunyuanVideo: bfloat16 or float16, Wan2.1: bfloat16
1391
  dit_dtype = torch.bfloat16 if args.dit_dtype is None else model_utils.str_to_dtype(args.dit_dtype)
1392
- dit_weight_dtype = torch.float8_e4m3fn if args.fp8_base else dit_dtype
1393
  logger.info(f"DiT precision: {dit_dtype}, weight precision: {dit_weight_dtype}")
1394
 
1395
  # get embedding for sampling images
@@ -1406,6 +1472,7 @@ class NetworkTrainer:
1406
 
1407
  # load DiT model
1408
  blocks_to_swap = args.blocks_to_swap if args.blocks_to_swap else 0
 
1409
  loading_device = "cpu" if blocks_to_swap > 0 else accelerator.device
1410
 
1411
  logger.info(f"Loading DiT model from {args.dit}")
@@ -1423,7 +1490,9 @@ class NetworkTrainer:
1423
  raise ValueError(
1424
  f"either --sdpa, --flash-attn, --flash3, --sage-attn or --xformers must be specified / --sdpa, --flash-attn, --flash3, --sage-attn, --xformersのいずれかを指定してください"
1425
  )
1426
- transformer = self.load_transformer(args, args.dit, attn_mode, args.split_attn, loading_device, dit_weight_dtype)
 
 
1427
  transformer.eval()
1428
  transformer.requires_grad_(False)
1429
 
@@ -1565,7 +1634,7 @@ class NetworkTrainer:
1565
  network_dtype = weight_dtype
1566
  network.to(network_dtype)
1567
 
1568
- if dit_weight_dtype != dit_dtype:
1569
  logger.info(f"casting model to {dit_weight_dtype}")
1570
  transformer.to(dit_weight_dtype)
1571
 
@@ -2239,6 +2308,34 @@ def setup_parser_common() -> argparse.ArgumentParser:
2239
  # parser.add_argument("--full_fp16", action="store_true", help="fp16 training including gradients / 勾配も含めてfp16で学習する")
2240
  # parser.add_argument("--full_bf16", action="store_true", help="bf16 training including gradients / 勾配も含めてbf16で学習する")
2241
 
2242
  parser.add_argument(
2243
  "--blocks_to_swap",
2244
  type=int,
@@ -2590,5 +2687,7 @@ if __name__ == "__main__":
2590
  args = parser.parse_args()
2591
  args = read_config_from_file(args, parser)
2592
 
 
 
2593
  trainer = NetworkTrainer()
2594
  trainer.train(args)
 
24
 
25
  import torch
26
  from tqdm import tqdm
27
+ from accelerate.utils import TorchDynamoPlugin, set_seed, DynamoBackend
28
  from accelerate import Accelerator, InitProcessGroupKwargs, DistributedDataParallelKwargs, PartialState
29
  from safetensors.torch import load_file
30
  import transformers
 
159
  ]
160
  kwargs_handlers = [i for i in kwargs_handlers if i is not None]
161
 
162
+ dynamo_plugin = None
163
+ if args.dynamo_backend.upper() != "NO":
164
+ dynamo_plugin = TorchDynamoPlugin(
165
+ backend=DynamoBackend(args.dynamo_backend.upper()),
166
+ mode=args.dynamo_mode,
167
+ fullgraph=args.dynamo_fullgraph,
168
+ dynamic=args.dynamo_dynamic,
169
+ )
170
+
171
  accelerator = Accelerator(
172
  gradient_accumulation_steps=args.gradient_accumulation_steps,
173
  mixed_precision=args.mixed_precision,
174
  log_with=log_with,
175
  project_dir=logging_dir,
176
+ dynamo_plugin=dynamo_plugin,
177
  kwargs_handlers=kwargs_handlers,
178
  )
179
  print("accelerator device:", accelerator.device)
 
238
  prompt_dict["image_path"] = m.group(1)
239
  continue
240
 
241
+ m = re.match(r"cn (.+)", parg, re.IGNORECASE)
242
+ if m:
243
+ prompt_dict["control_video_path"] = m.group(1)
244
+ continue
245
+
246
+ m = re.match(r"ci (.+)", parg, re.IGNORECASE)
247
+ if m:
248
+ # can be multiple control images
249
+ control_image_path = m.group(1)
250
+ if "control_image_path" not in prompt_dict:
251
+ prompt_dict["control_image_path"] = []
252
+ prompt_dict["control_image_path"].append(control_image_path)
253
+ continue
254
+
255
+ m = re.match(r"of (.+)", parg, re.IGNORECASE)
256
+ if m:  # one frame inference options
257
+ prompt_dict["one_frame"] = m.group(1)
258
+ continue
259
+
260
  except ValueError as ex:
261
  logger.error(f"Exception in parsing / 解析エラー: {parg}")
262
  logger.error(ex)
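The `cn`, `ci` and `of` matches above extend the sample-prompt syntax with control inputs. A small sketch of how such a line would be parsed, assuming the same `--`-separated option format used by the existing options; the paths are placeholders:

```python
import re

# Hypothetical prompt line: "--cn" gives a control video, "--ci" may repeat for several control images.
line = "a character waves --w 256 --h 256 --f 25 --cn control/pose.mp4 --ci ref/a.png --ci ref/b.png"

prompt_dict = {}
for parg in [p.strip() for p in line.split("--")[1:]]:
    if m := re.match(r"cn (.+)", parg, re.IGNORECASE):
        prompt_dict["control_video_path"] = m.group(1)
    elif m := re.match(r"ci (.+)", parg, re.IGNORECASE):
        prompt_dict.setdefault("control_image_path", []).append(m.group(1))

print(prompt_dict)
# {'control_video_path': 'control/pose.mp4', 'control_image_path': ['ref/a.png', 'ref/b.png']}
```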
 
369
 
370
  class NetworkTrainer:
371
  def __init__(self):
372
+ self.blocks_to_swap = None
 
373
 
374
  # TODO 他のスクリプトと共通化する
375
  def generate_step_logs(
 
900
  transformer.switch_block_swap_for_inference()
901
 
902
  # Create a directory to save the samples
903
+ save_dir = os.path.join(args.output_dir, "sample")
904
  os.makedirs(save_dir, exist_ok=True)
905
 
906
  # save random state to restore later
 
947
  width = sample_parameter.get("width", 256) # make smaller for faster and memory saving inference
948
  height = sample_parameter.get("height", 256)
949
  frame_count = sample_parameter.get("frame_count", 1)
950
+ guidance_scale = sample_parameter.get("guidance_scale", self.default_guidance_scale)
951
  discrete_flow_shift = sample_parameter.get("discrete_flow_shift", 14.5)
952
  seed = sample_parameter.get("seed")
953
  prompt: str = sample_parameter.get("prompt", "")
954
  cfg_scale = sample_parameter.get("cfg_scale", None) # None for architecture default
955
  negative_prompt = sample_parameter.get("negative_prompt", None)
956
 
957
+ frame_count = (frame_count - 1) // 4 * 4 + 1 # 1, 5, 9, 13, ... For HunyuanVideo and Wan2.1
958
+
959
  if self.i2v_training:
960
  image_path = sample_parameter.get("image_path", None)
961
  if image_path is None:
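The frame-count snapping in the hunk above rounds any requested `frame_count` down to the nearest value of the form 4k+1, which is what the temporal compression of the video VAEs expects. A quick check of the formula:

```python
snap = lambda f: (f - 1) // 4 * 4 + 1  # same expression as in the hunk above
print([(f, snap(f)) for f in (1, 2, 5, 8, 9, 16, 25)])
# [(1, 1), (2, 1), (5, 5), (8, 5), (9, 9), (16, 13), (25, 25)]
```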
 
964
  else:
965
  image_path = None
966
 
967
+ if self.control_training:
968
+ control_video_path = sample_parameter.get("control_video_path", None)
969
+ if control_video_path is None:
970
+ logger.error(
971
+ "No control_video_path for control model / controlモデルのサンプル画像生成にはcontrol_video_pathが必要です"
972
+ )
973
+ return
974
+ else:
975
+ control_video_path = None
976
+
977
  device = accelerator.device
978
  if seed is not None:
979
  torch.manual_seed(seed)
 
1003
 
1004
  if self.i2v_training:
1005
  logger.info(f"image path: {image_path}")
1006
+ if self.control_training:
1007
+ logger.info(f"control video path: {control_video_path}")
1008
 
1009
  # inference: architecture dependent
1010
  video = self.do_inference(
 
1024
  guidance_scale,
1025
  cfg_scale,
1026
  image_path=image_path,
1027
+ control_video_path=control_video_path,
1028
  )
1029
 
1030
  # Save video
1031
+ if video is None:
1032
+ logger.error("No video generated / 生成された動画がありません")
1033
+ return
1034
+
1035
  ts_str = time.strftime("%Y%m%d%H%M%S", time.localtime())
1036
  num_suffix = f"e{epoch:06d}" if epoch is not None else f"{steps:06d}"
1037
  seed_suffix = "" if seed is None else f"_{seed}"
 
1058
  def architecture_full_name(self) -> str:
1059
  return ARCHITECTURE_HUNYUAN_VIDEO_FULL
1060
 
1061
+ def handle_model_specific_args(self, args: argparse.Namespace):
1062
+ self.pos_embed_cache = {}
1063
+
1064
  self._i2v_training = args.dit_in_channels == 32 # may be changed in the future
1065
  if self._i2v_training:
1066
  logger.info("I2V training mode")
1067
 
1068
+ self._control_training = False # HunyuanVideo does not support control training yet
1069
+
1070
+ self.default_guidance_scale = 6.0
1071
+
1072
  @property
1073
  def i2v_training(self) -> bool:
1074
  return self._i2v_training
1075
 
1076
+ @property
1077
+ def control_training(self) -> bool:
1078
+ return self._control_training
1079
+
1080
  def process_sample_prompts(
1081
  self,
1082
  args: argparse.Namespace,
 
1165
  guidance_scale,
1166
  cfg_scale,
1167
  image_path=None,
1168
+ control_video_path=None,
1169
  ):
1170
  """architecture dependent inference"""
1171
  device = accelerator.device
 
1318
 
1319
  def load_transformer(
1320
  self,
1321
+ accelerator: Accelerator,
1322
  args: argparse.Namespace,
1323
  dit_path: str,
1324
  attn_mode: str,
1325
  split_attn: bool,
1326
  loading_device: str,
1327
+ dit_weight_dtype: Optional[torch.dtype],
1328
  ):
1329
  transformer = load_transformer(dit_path, attn_mode, split_attn, loading_device, dit_weight_dtype, args.dit_in_channels)
1330
 
 
1405
  raise ValueError("dataset_config is required / dataset_configが必要です")
1406
  if args.dit is None:
1407
  raise ValueError("path to DiT model is required / DiTモデルのパスが必要です")
1408
+ assert not args.fp8_scaled or args.fp8_base, "fp8_scaled requires fp8_base / fp8_scaledはfp8_baseが必要です"
1409
+
1410
+ if args.sage_attn:
1411
+ raise ValueError(
1412
+ "SageAttention doesn't support training currently. Please use `--sdpa` or `--xformers` etc. instead."
1413
+ " / SageAttentionは現在学習をサポートしていないようです。`--sdpa`や`--xformers`などの他のオプションを使ってください"
1414
+ )
1415
 
1416
  # check model specific arguments
1417
+ self.handle_model_specific_args(args)
1418
 
1419
  # show timesteps for debugging
1420
  if args.show_timesteps:
 
1455
 
1456
  # HunyuanVideo: bfloat16 or float16, Wan2.1: bfloat16
1457
  dit_dtype = torch.bfloat16 if args.dit_dtype is None else model_utils.str_to_dtype(args.dit_dtype)
1458
+ dit_weight_dtype = (None if args.fp8_scaled else torch.float8_e4m3fn) if args.fp8_base else dit_dtype
1459
  logger.info(f"DiT precision: {dit_dtype}, weight precision: {dit_weight_dtype}")
1460
 
1461
  # get embedding for sampling images
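With `--fp8_scaled` in the mix, the weight-dtype selection above now has three outcomes. A sketch of the decision, pulling the same expression into a helper purely for illustration:

```python
import torch

def pick_weight_dtype(fp8_base: bool, fp8_scaled: bool, dit_dtype=torch.bfloat16):
    # None means "keep the checkpoint dtype and let the scaled-fp8 path quantize later"
    return (None if fp8_scaled else torch.float8_e4m3fn) if fp8_base else dit_dtype

print(pick_weight_dtype(False, False))  # torch.bfloat16
print(pick_weight_dtype(True, False))   # torch.float8_e4m3fn (plain fp8 cast)
print(pick_weight_dtype(True, True))    # None (scaled fp8 optimization)
```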
 
1472
 
1473
  # load DiT model
1474
  blocks_to_swap = args.blocks_to_swap if args.blocks_to_swap else 0
1475
+ self.blocks_to_swap = blocks_to_swap
1476
  loading_device = "cpu" if blocks_to_swap > 0 else accelerator.device
1477
 
1478
  logger.info(f"Loading DiT model from {args.dit}")
 
1490
  raise ValueError(
1491
  f"either --sdpa, --flash-attn, --flash3, --sage-attn or --xformers must be specified / --sdpa, --flash-attn, --flash3, --sage-attn, --xformersのいずれかを指定してください"
1492
  )
1493
+ transformer = self.load_transformer(
1494
+ accelerator, args, args.dit, attn_mode, args.split_attn, loading_device, dit_weight_dtype
1495
+ )
1496
  transformer.eval()
1497
  transformer.requires_grad_(False)
1498
 
 
1634
  network_dtype = weight_dtype
1635
  network.to(network_dtype)
1636
 
1637
+ if dit_weight_dtype != dit_dtype and dit_weight_dtype is not None:
1638
  logger.info(f"casting model to {dit_weight_dtype}")
1639
  transformer.to(dit_weight_dtype)
1640
 
 
2308
  # parser.add_argument("--full_fp16", action="store_true", help="fp16 training including gradients / 勾配も含めてfp16で学習する")
2309
  # parser.add_argument("--full_bf16", action="store_true", help="bf16 training including gradients / 勾配も含めてbf16で学習する")
2310
 
2311
+ parser.add_argument(
2312
+ "--dynamo_backend",
2313
+ type=str,
2314
+ default="NO",
2315
+ choices=[e.value for e in DynamoBackend],
2316
+ help="dynamo backend type (default is None) / dynamoのbackendの種類(デフォルトは None)",
2317
+ )
2318
+
2319
+ parser.add_argument(
2320
+ "--dynamo_mode",
2321
+ type=str,
2322
+ default=None,
2323
+ choices=["default", "reduce-overhead", "max-autotune"],
2324
+ help="dynamo mode (default is default) / dynamoのモード(デフォルトは default)",
2325
+ )
2326
+
2327
+ parser.add_argument(
2328
+ "--dynamo_fullgraph",
2329
+ action="store_true",
2330
+ help="use fullgraph mode for dynamo / dynamoのfullgraphモードを使う",
2331
+ )
2332
+
2333
+ parser.add_argument(
2334
+ "--dynamo_dynamic",
2335
+ action="store_true",
2336
+ help="use dynamic mode for dynamo / dynamoのdynamicモードを使う",
2337
+ )
2338
+
2339
  parser.add_argument(
2340
  "--blocks_to_swap",
2341
  type=int,
 
2687
  args = parser.parse_args()
2688
  args = read_config_from_file(args, parser)
2689
 
2690
+ args.fp8_scaled = False # HunyuanVideo does not support this yet
2691
+
2692
  trainer = NetworkTrainer()
2693
  trainer.train(args)
merge_lora.py CHANGED
@@ -45,7 +45,7 @@ def main():
45
 
46
  logger.info(f"Loading LoRA weights from {lora_weight} with multiplier {lora_multiplier}")
47
  weights_sd = load_file(lora_weight)
48
- network = lora.create_network_from_weights_hunyuan_video(
49
  lora_multiplier, weights_sd, unet=transformer, for_inference=True
50
  )
51
  logger.info("Merging LoRA weights to DiT model")
 
45
 
46
  logger.info(f"Loading LoRA weights from {lora_weight} with multiplier {lora_multiplier}")
47
  weights_sd = load_file(lora_weight)
48
+ network = lora.create_arch_network_from_weights(
49
  lora_multiplier, weights_sd, unet=transformer, for_inference=True
50
  )
51
  logger.info("Merging LoRA weights to DiT model")
modules/fp8_optimization_utils.py ADDED
@@ -0,0 +1,356 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ import logging
6
+
7
+ from tqdm import tqdm
8
+
9
+ logger = logging.getLogger(__name__)
10
+ logging.basicConfig(level=logging.INFO)
11
+
12
+ from utils.device_utils import clean_memory_on_device
13
+
14
+
15
+ def calculate_fp8_maxval(exp_bits=4, mantissa_bits=3, sign_bits=1):
16
+ """
17
+ Calculate the maximum representable value in FP8 format.
18
+ Default is E4M3 format (4-bit exponent, 3-bit mantissa, 1-bit sign).
19
+
20
+ Args:
21
+ exp_bits (int): Number of exponent bits
22
+ mantissa_bits (int): Number of mantissa bits
23
+ sign_bits (int): Number of sign bits (0 or 1)
24
+
25
+ Returns:
26
+ float: Maximum value representable in FP8 format
27
+ """
28
+ assert exp_bits + mantissa_bits + sign_bits == 8, "Total bits must be 8"
29
+
30
+ # Calculate exponent bias
31
+ bias = 2 ** (exp_bits - 1) - 1
32
+
33
+ # Calculate maximum mantissa value
34
+ mantissa_max = 1.0
35
+ for i in range(mantissa_bits - 1):
36
+ mantissa_max += 2 ** -(i + 1)
37
+
38
+ # Calculate maximum value
39
+ max_value = mantissa_max * (2 ** (2**exp_bits - 1 - bias))
40
+
41
+ return max_value
42
+
43
+
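For the default E4M3 layout this formula reproduces the familiar 448 limit; a worked check against PyTorch's own `finfo`:

```python
import torch

exp_bits, mantissa_bits = 4, 3
bias = 2 ** (exp_bits - 1) - 1                              # 7
mantissa_max = 1.0 + 0.5 + 0.25                             # 1.75, as accumulated by the loop above
max_value = mantissa_max * 2 ** (2 ** exp_bits - 1 - bias)  # 1.75 * 2**8
print(max_value)                                            # 448.0
print(torch.finfo(torch.float8_e4m3fn).max)                 # 448.0, matches PyTorch's E4M3FN limit
```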
44
+ def quantize_tensor_to_fp8(tensor, scale, exp_bits=4, mantissa_bits=3, sign_bits=1, max_value=None, min_value=None):
45
+ """
46
+ Quantize a tensor to FP8 format.
47
+
48
+ Args:
49
+ tensor (torch.Tensor): Tensor to quantize
50
+ scale (float or torch.Tensor): Scale factor
51
+ exp_bits (int): Number of exponent bits
52
+ mantissa_bits (int): Number of mantissa bits
53
+ sign_bits (int): Number of sign bits
54
+
55
+ Returns:
56
+ tuple: (quantized_tensor, scale_factor)
57
+ """
58
+ # Create scaled tensor
59
+ scaled_tensor = tensor / scale
60
+
61
+ # Calculate FP8 parameters
62
+ bias = 2 ** (exp_bits - 1) - 1
63
+
64
+ if max_value is None:
65
+ # Calculate max and min values
66
+ max_value = calculate_fp8_maxval(exp_bits, mantissa_bits, sign_bits)
67
+ min_value = -max_value if sign_bits > 0 else 0.0
68
+
69
+ # Clamp tensor to range
70
+ clamped_tensor = torch.clamp(scaled_tensor, min_value, max_value)
71
+
72
+ # Quantization process
73
+ abs_values = torch.abs(clamped_tensor)
74
+ nonzero_mask = abs_values > 0
75
+
76
+ # Calculate log scales (only for non-zero elements)
77
+ log_scales = torch.zeros_like(clamped_tensor)
78
+ if nonzero_mask.any():
79
+ log_scales[nonzero_mask] = torch.floor(torch.log2(abs_values[nonzero_mask]) + bias).detach()
80
+
81
+ # Limit log scales and calculate quantization factor
82
+ log_scales = torch.clamp(log_scales, min=1.0)
83
+ quant_factor = 2.0 ** (log_scales - mantissa_bits - bias)
84
+
85
+ # Quantize and dequantize
86
+ quantized = torch.round(clamped_tensor / quant_factor) * quant_factor
87
+
88
+ return quantized, scale
89
+
90
+
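A minimal round-trip check of the quantizer, assuming the module is importable as `modules.fp8_optimization_utils` (the path used in this commit):

```python
import torch
from modules.fp8_optimization_utils import calculate_fp8_maxval, quantize_tensor_to_fp8

w = torch.randn(256, 256)
max_value = calculate_fp8_maxval()          # 448.0 for the default E4M3
scale = w.abs().max() / max_value
q, _ = quantize_tensor_to_fp8(w, scale)     # values of w / scale snapped onto the E4M3 grid
dequant = q * scale                         # approximate reconstruction of w
print((w - dequant).abs().mean())           # small quantization error
```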
91
+ def optimize_state_dict_with_fp8(
92
+ state_dict, calc_device, target_layer_keys=None, exclude_layer_keys=None, exp_bits=4, mantissa_bits=3, move_to_device=False
93
+ ):
94
+ """
95
+ Optimize Linear layer weights in a model's state dict to FP8 format.
96
+
97
+ Args:
98
+ state_dict (dict): State dict to optimize, replaced in-place
99
+ calc_device (str): Device to quantize tensors on
100
+ target_layer_keys (list, optional): Layer key patterns to target (None for all Linear layers)
101
+ exclude_layer_keys (list, optional): Layer key patterns to exclude
102
+ exp_bits (int): Number of exponent bits
103
+ mantissa_bits (int): Number of mantissa bits
104
+ move_to_device (bool): Move optimized tensors to the calculating device
105
+
106
+ Returns:
107
+ dict: FP8 optimized state dict
108
+ """
109
+ if exp_bits == 4 and mantissa_bits == 3:
110
+ fp8_dtype = torch.float8_e4m3fn
111
+ elif exp_bits == 5 and mantissa_bits == 2:
112
+ fp8_dtype = torch.float8_e5m2
113
+ else:
114
+ raise ValueError(f"Unsupported FP8 format: E{exp_bits}M{mantissa_bits}")
115
+
116
+ # Calculate FP8 max value
117
+ max_value = calculate_fp8_maxval(exp_bits, mantissa_bits)
118
+ min_value = -max_value # this function supports only signed FP8
119
+
120
+ # Create optimized state dict
121
+ optimized_count = 0
122
+
123
+ # Enumerate target keys
124
+ target_state_dict_keys = []
125
+ for key in state_dict.keys():
126
+ # Check if it's a weight key and matches target patterns
127
+ is_target = (target_layer_keys is None or any(pattern in key for pattern in target_layer_keys)) and key.endswith(".weight")
128
+ is_excluded = exclude_layer_keys is not None and any(pattern in key for pattern in exclude_layer_keys)
129
+ is_target = is_target and not is_excluded
130
+
131
+ if is_target and isinstance(state_dict[key], torch.Tensor):
132
+ target_state_dict_keys.append(key)
133
+
134
+ # Process each key
135
+ for key in tqdm(target_state_dict_keys):
136
+ value = state_dict[key]
137
+
138
+ # Save original device and dtype
139
+ original_device = value.device
140
+ original_dtype = value.dtype
141
+
142
+ # Move to calculation device
143
+ if calc_device is not None:
144
+ value = value.to(calc_device)
145
+
146
+ # Calculate scale factor
147
+ scale = torch.max(torch.abs(value.flatten())) / max_value
148
+ # print(f"Optimizing {key} with scale: {scale}")
149
+
150
+ # Quantize weight to FP8
151
+ quantized_weight, _ = quantize_tensor_to_fp8(value, scale, exp_bits, mantissa_bits, 1, max_value, min_value)
152
+
153
+ # Add to state dict using original key for weight and new key for scale
154
+ fp8_key = key # Maintain original key
155
+ scale_key = key.replace(".weight", ".scale_weight")
156
+
157
+ quantized_weight = quantized_weight.to(fp8_dtype)
158
+
159
+ if not move_to_device:
160
+ quantized_weight = quantized_weight.to(original_device)
161
+
162
+ scale_tensor = torch.tensor([scale], dtype=original_dtype, device=quantized_weight.device)
163
+
164
+ state_dict[fp8_key] = quantized_weight
165
+ state_dict[scale_key] = scale_tensor
166
+
167
+ optimized_count += 1
168
+
169
+ if calc_device is not None: # optimized_count % 10 == 0 and
170
+ # free memory on calculation device
171
+ clean_memory_on_device(calc_device)
172
+
173
+ logger.info(f"Number of optimized Linear layers: {optimized_count}")
174
+ return state_dict
175
+
176
+
177
+ def fp8_linear_forward_patch(self: nn.Linear, x, use_scaled_mm=False, max_value=None):
178
+ """
179
+ Patched forward method for Linear layers with FP8 weights.
180
+
181
+ Args:
182
+ self: Linear layer instance
183
+ x (torch.Tensor): Input tensor
184
+ use_scaled_mm (bool): Use scaled_mm for FP8 Linear layers, requires SM 8.9+ (RTX 40 series)
185
+ max_value (float): Maximum value for FP8 quantization. If None, no quantization is applied for input tensor.
186
+
187
+ Returns:
188
+ torch.Tensor: Result of linear transformation
189
+ """
190
+ if use_scaled_mm:
191
+ input_dtype = x.dtype
192
+ original_weight_dtype = self.scale_weight.dtype
193
+ weight_dtype = self.weight.dtype
194
+ target_dtype = torch.float8_e5m2
195
+ assert weight_dtype == torch.float8_e4m3fn, "Only FP8 E4M3FN format is supported"
196
+ assert x.ndim == 3, "Input tensor must be 3D (batch_size, seq_len, hidden_dim)"
197
+
198
+ if max_value is None:
199
+ # no input quantization
200
+ scale_x = torch.tensor(1.0, dtype=torch.float32, device=x.device)
201
+ else:
202
+ # calculate scale factor for input tensor
203
+ scale_x = (torch.max(torch.abs(x.flatten())) / max_value).to(torch.float32)
204
+
205
+ # quantize input tensor to FP8: this seems to consume a lot of memory
206
+ x, _ = quantize_tensor_to_fp8(x, scale_x, 5, 2, 1, max_value, -max_value)
207
+
208
+ original_shape = x.shape
209
+ x = x.reshape(-1, x.shape[2]).to(target_dtype)
210
+
211
+ weight = self.weight.t()
212
+ scale_weight = self.scale_weight.to(torch.float32)
213
+
214
+ if self.bias is not None:
215
+ # float32 is not supported with bias in scaled_mm
216
+ o = torch._scaled_mm(x, weight, out_dtype=original_weight_dtype, bias=self.bias, scale_a=scale_x, scale_b=scale_weight)
217
+ else:
218
+ o = torch._scaled_mm(x, weight, out_dtype=input_dtype, scale_a=scale_x, scale_b=scale_weight)
219
+
220
+ return o.reshape(original_shape[0], original_shape[1], -1).to(input_dtype)
221
+
222
+ else:
223
+ # Dequantize the weight
224
+ original_dtype = self.scale_weight.dtype
225
+ dequantized_weight = self.weight.to(original_dtype) * self.scale_weight
226
+
227
+ # Perform linear transformation
228
+ if self.bias is not None:
229
+ output = F.linear(x, dequantized_weight, self.bias)
230
+ else:
231
+ output = F.linear(x, dequantized_weight)
232
+
233
+ return output
234
+
235
+
236
+ def apply_fp8_monkey_patch(model, optimized_state_dict, use_scaled_mm=False):
237
+ """
238
+ Apply monkey patching to a model using FP8 optimized state dict.
239
+
240
+ Args:
241
+ model (nn.Module): Model instance to patch
242
+ optimized_state_dict (dict): FP8 optimized state dict
243
+ use_scaled_mm (bool): Use scaled_mm for FP8 Linear layers, requires SM 8.9+ (RTX 40 series)
244
+
245
+ Returns:
246
+ nn.Module: The patched model (same instance, modified in-place)
247
+ """
248
+ # # Calculate FP8 float8_e5m2 max value
249
+ # max_value = calculate_fp8_maxval(5, 2)
250
+ max_value = None # do not quantize input tensor
251
+
252
+ # Find all scale keys to identify FP8-optimized layers
253
+ scale_keys = [k for k in optimized_state_dict.keys() if k.endswith(".scale_weight")]
254
+
255
+ # Enumerate patched layers
256
+ patched_module_paths = set()
257
+ for scale_key in scale_keys:
258
+ # Extract module path from scale key (remove .scale_weight)
259
+ module_path = scale_key.rsplit(".scale_weight", 1)[0]
260
+ patched_module_paths.add(module_path)
261
+
262
+ patched_count = 0
263
+
264
+ # Apply monkey patch to each layer with FP8 weights
265
+ for name, module in model.named_modules():
266
+ # Check if this module has a corresponding scale_weight
267
+ has_scale = name in patched_module_paths
268
+
269
+ # Apply patch if it's a Linear layer with FP8 scale
270
+ if isinstance(module, nn.Linear) and has_scale:
271
+ # register the scale_weight as a buffer to load the state_dict
272
+ module.register_buffer("scale_weight", torch.tensor(1.0, dtype=module.weight.dtype))
273
+
274
+ # Create a new forward method with the patched version.
275
+ def new_forward(self, x):
276
+ return fp8_linear_forward_patch(self, x, use_scaled_mm, max_value)
277
+
278
+ # Bind method to module
279
+ module.forward = new_forward.__get__(module, type(module))
280
+
281
+ patched_count += 1
282
+
283
+ logger.info(f"Number of monkey-patched Linear layers: {patched_count}")
284
+ return model
285
+
286
+
287
+ # Example usage
288
+ def example_usage():
289
+ # Small test model
290
+ class TestModel(nn.Module):
291
+ def __init__(self):
292
+ super().__init__()
293
+ fc1 = nn.Linear(768, 3072)
294
+ act1 = nn.GELU()
295
+ fc2 = nn.Linear(3072, 768)
296
+ act2 = nn.GELU()
297
+ fc3 = nn.Linear(768, 768)
298
+
299
+ # Set layer names for testing
300
+ self.single_blocks = nn.ModuleList([fc1, act1, fc2, act2, fc3])
301
+
302
+ self.fc4 = nn.Linear(768, 128)
303
+
304
+ def forward(self, x):
305
+ for layer in self.single_blocks:
306
+ x = layer(x)
307
+ x = self.fc4(x)
308
+ return x
309
+
310
+ # Instantiate model
311
+ test_model = TestModel()
312
+ test_model.to(torch.float16) # convert to FP16 for testing
313
+
314
+ # Test input tensor
315
+ test_input = torch.randn(1, 768, dtype=torch.float16)
316
+
317
+ # Calculate output before optimization
318
+ with torch.no_grad():
319
+ original_output = test_model(test_input)
320
+ print("original output", original_output[0, :5])
321
+
322
+ # Get state dict
323
+ state_dict = test_model.state_dict()
324
+
325
+ # Apply FP8 optimization to state dict
326
+ cuda_device = torch.device("cuda")
327
+ optimized_state_dict = optimize_state_dict_with_fp8(state_dict, cuda_device, ["single_blocks"], ["2"])
328
+
329
+ # Apply monkey patching to the model
330
+ optimized_model = TestModel() # re-instantiate model
331
+ optimized_model.to(torch.float16) # convert to FP16 for testing
332
+ apply_fp8_monkey_patch(optimized_model, optimized_state_dict)
333
+
334
+ # Load optimized state dict
335
+ optimized_model.load_state_dict(optimized_state_dict, strict=True, assign=True) # assign=True to load buffer
336
+
337
+ # Calculate output after optimization
338
+ with torch.no_grad():
339
+ optimized_output = optimized_model(test_input)
340
+ print("optimized output", optimized_output[0, :5])
341
+
342
+ # Compare accuracy
343
+ error = torch.mean(torch.abs(original_output - optimized_output))
344
+ print(f"Mean absolute error: {error.item()}")
345
+
346
+ # Check memory usage
347
+ original_params = sum(p.nelement() * p.element_size() for p in test_model.parameters()) / (1024 * 1024)
348
+ print(f"Model parameter memory: {original_params:.2f} MB")
349
+ optimized_params = sum(p.nelement() * p.element_size() for p in optimized_model.parameters()) / (1024 * 1024)
350
+ print(f"Optimized model parameter memory: {optimized_params:.2f} MB")
351
+
352
+ return test_model
353
+
354
+
355
+ if __name__ == "__main__":
356
+ example_usage()
networks/lora.py CHANGED
@@ -8,7 +8,6 @@ import math
8
  import os
9
  import re
10
  from typing import Dict, List, Optional, Type, Union
11
- from diffusers import AutoencoderKL
12
  from transformers import CLIPTextModel
13
  import numpy as np
14
  import torch
 
8
  import os
9
  import re
10
  from typing import Dict, List, Optional, Type, Union
 
11
  from transformers import CLIPTextModel
12
  import numpy as np
13
  import torch
networks/lora_framepack.py ADDED
@@ -0,0 +1,65 @@
1
+ # LoRA module for FramePack
2
+
3
+ import ast
4
+ from typing import Dict, List, Optional
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ import logging
9
+
10
+ logger = logging.getLogger(__name__)
11
+ logging.basicConfig(level=logging.INFO)
12
+
13
+ import networks.lora as lora
14
+
15
+
16
+ FRAMEPACK_TARGET_REPLACE_MODULES = ["HunyuanVideoTransformerBlock", "HunyuanVideoSingleTransformerBlock"]
17
+
18
+
19
+ def create_arch_network(
20
+ multiplier: float,
21
+ network_dim: Optional[int],
22
+ network_alpha: Optional[float],
23
+ vae: nn.Module,
24
+ text_encoders: List[nn.Module],
25
+ unet: nn.Module,
26
+ neuron_dropout: Optional[float] = None,
27
+ **kwargs,
28
+ ):
29
+ # add default exclude patterns
30
+ exclude_patterns = kwargs.get("exclude_patterns", None)
31
+ if exclude_patterns is None:
32
+ exclude_patterns = []
33
+ else:
34
+ exclude_patterns = ast.literal_eval(exclude_patterns)
35
+
36
+ # exclude if 'norm' in the name of the module
37
+ exclude_patterns.append(r".*(norm).*")
38
+
39
+ kwargs["exclude_patterns"] = exclude_patterns
40
+
41
+ return lora.create_network(
42
+ FRAMEPACK_TARGET_REPLACE_MODULES,
43
+ "lora_unet",
44
+ multiplier,
45
+ network_dim,
46
+ network_alpha,
47
+ vae,
48
+ text_encoders,
49
+ unet,
50
+ neuron_dropout=neuron_dropout,
51
+ **kwargs,
52
+ )
53
+
54
+
55
+ def create_arch_network_from_weights(
56
+ multiplier: float,
57
+ weights_sd: Dict[str, torch.Tensor],
58
+ text_encoders: Optional[List[nn.Module]] = None,
59
+ unet: Optional[nn.Module] = None,
60
+ for_inference: bool = False,
61
+ **kwargs,
62
+ ) -> lora.LoRANetwork:
63
+ return lora.create_network_from_weights(
64
+ FRAMEPACK_TARGET_REPLACE_MODULES, multiplier, weights_sd, text_encoders, unet, for_inference, **kwargs
65
+ )
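Loading trained weights back mirrors the `merge_lora.py` change above: the architecture-specific factory rebuilds the network from a saved state dict. A minimal sketch, assuming a FramePack DiT instance `transformer` is already loaded and the LoRA path is a placeholder; note that `exclude_patterns`, when passed to `create_arch_network`, must be a Python-literal string because it goes through `ast.literal_eval`:

```python
from safetensors.torch import load_file
import networks.lora_framepack as lora_framepack

weights_sd = load_file("framepack-lora.safetensors")  # placeholder path
network = lora_framepack.create_arch_network_from_weights(
    1.0, weights_sd, unet=transformer, for_inference=True  # `transformer` assumed to be loaded already
)
```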
pyproject.toml CHANGED
@@ -5,13 +5,15 @@ description = "Musubi Tuner by kohya_ss"
5
  readme = "README.md"
6
  requires-python = ">=3.10, <3.11"
7
  dependencies = [
8
- "accelerate>=1.0.0",
9
  "ascii-magic==2.3.0",
10
  "av==14.0.1",
11
  "bitsandbytes>=0.45.0",
12
  "diffusers>=0.32.1",
 
13
  "einops>=0.7.0",
14
- "huggingface-hub>=0.26.5",
 
15
  "matplotlib>=3.10.0",
16
  "opencv-python>=4.10.0.84",
17
  "pillow>=10.2.0",
 
5
  readme = "README.md"
6
  requires-python = ">=3.10, <3.11"
7
  dependencies = [
8
+ "accelerate>=1.6.0",
9
  "ascii-magic==2.3.0",
10
  "av==14.0.1",
11
  "bitsandbytes>=0.45.0",
12
  "diffusers>=0.32.1",
13
+ "easydict==1.13",
14
  "einops>=0.7.0",
15
+ "ftfy==6.3.1",
16
+ "huggingface-hub>=0.30.0",
17
  "matplotlib>=3.10.0",
18
  "opencv-python>=4.10.0.84",
19
  "pillow>=10.2.0",
requirements.txt CHANGED
@@ -1,11 +1,11 @@
1
- accelerate==1.2.1
2
  av==14.0.1
3
- bitsandbytes==0.45.0
4
  diffusers==0.32.1
5
  einops==0.7.0
6
- huggingface-hub==0.26.5
7
  opencv-python==4.10.0.84
8
- pillow==10.2.0
9
  safetensors==0.4.5
10
  toml==0.10.2
11
  tqdm==4.67.1
 
1
+ accelerate==1.6.0
2
  av==14.0.1
3
+ bitsandbytes==0.45.4
4
  diffusers==0.32.1
5
  einops==0.7.0
6
+ huggingface-hub==0.30.0
7
  opencv-python==4.10.0.84
8
+ pillow
9
  safetensors==0.4.5
10
  toml==0.10.2
11
  tqdm==4.67.1
utils/safetensors_utils.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import torch
2
  import json
3
  import struct
@@ -169,7 +171,7 @@ class MemoryEfficientSafeOpen:
169
 
170
 
171
  def load_safetensors(
172
- path: str, device: Union[str, torch.device], disable_mmap: bool = False, dtype: Optional[torch.dtype] = torch.float32
173
  ) -> dict[str, torch.Tensor]:
174
  if disable_mmap:
175
  # return safetensors.torch.load(open(path, "rb").read())
@@ -189,3 +191,31 @@ def load_safetensors(
189
  for key in state_dict.keys():
190
  state_dict[key] = state_dict[key].to(dtype=dtype)
191
  return state_dict
1
+ import os
2
+ import re
3
  import torch
4
  import json
5
  import struct
 
171
 
172
 
173
  def load_safetensors(
174
+ path: str, device: Union[str, torch.device], disable_mmap: bool = False, dtype: Optional[torch.dtype] = None
175
  ) -> dict[str, torch.Tensor]:
176
  if disable_mmap:
177
  # return safetensors.torch.load(open(path, "rb").read())
 
191
  for key in state_dict.keys():
192
  state_dict[key] = state_dict[key].to(dtype=dtype)
193
  return state_dict
194
+
195
+
196
+ def load_split_weights(
197
+ file_path: str, device: Union[str, torch.device] = "cpu", disable_mmap: bool = False
198
+ ) -> Dict[str, torch.Tensor]:
199
+ """
200
+ Load split weights from a file. If the file name ends with 00001-of-00004 etc, it will load all files with the same prefix.
201
+ dtype is as is, no conversion is done.
202
+ """
203
+ device = torch.device(device)
204
+
205
+ # if the file name ends with 00001-of-00004 etc, we need to load the files with the same prefix
206
+ basename = os.path.basename(file_path)
207
+ match = re.match(r"^(.*?)(\d+)-of-(\d+)\.safetensors$", basename)
208
+ if match:
209
+ prefix = basename[: match.start(2)]
210
+ count = int(match.group(3))
211
+ state_dict = {}
212
+ for i in range(count):
213
+ filename = f"{prefix}{i+1:05d}-of-{count:05d}.safetensors"
214
+ filepath = os.path.join(os.path.dirname(file_path), filename)
215
+ if os.path.exists(filepath):
216
+ state_dict.update(load_safetensors(filepath, device=device, disable_mmap=disable_mmap))
217
+ else:
218
+ raise FileNotFoundError(f"File {filepath} not found")
219
+ else:
220
+ state_dict = load_safetensors(file_path, device=device, disable_mmap=disable_mmap)
221
+ return state_dict
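The shard detection keys off names like `...-00001-of-00004.safetensors`; a small sketch of what the regex extracts and which sibling files get loaded, using a placeholder file name:

```python
import re

basename = "diffusion_pytorch_model-00001-of-00004.safetensors"  # placeholder shard name
m = re.match(r"^(.*?)(\d+)-of-(\d+)\.safetensors$", basename)
prefix, count = basename[: m.start(2)], int(m.group(3))
print([f"{prefix}{i + 1:05d}-of-{count:05d}.safetensors" for i in range(count)])
# ['diffusion_pytorch_model-00001-of-00004.safetensors', ..., 'diffusion_pytorch_model-00004-of-00004.safetensors']
```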
utils/sai_model_spec.py CHANGED
@@ -7,7 +7,7 @@ from typing import List, Optional, Tuple, Union
7
  import safetensors
8
  import logging
9
 
10
- from dataset.image_video_dataset import ARCHITECTURE_HUNYUAN_VIDEO, ARCHITECTURE_WAN
11
 
12
  logger = logging.getLogger(__name__)
13
  logger.setLevel(logging.INFO)
@@ -59,9 +59,13 @@ ARCH_HUNYUAN_VIDEO = "hunyuan-video"
59
  # Official Wan2.1 weights does not have sai_model_spec, so we use this as an architecture name
60
  ARCH_WAN = "wan2.1"
61
 
 
 
62
  ADAPTER_LORA = "lora"
63
 
64
  IMPL_HUNYUAN_VIDEO = "https://github.com/Tencent/HunyuanVideo"
 
 
65
 
66
  PRED_TYPE_EPSILON = "epsilon"
67
  # PRED_TYPE_V = "v"
@@ -121,8 +125,13 @@ def build_metadata(
121
  # arch = ARCH_HUNYUAN_VIDEO
122
  if architecture == ARCHITECTURE_HUNYUAN_VIDEO:
123
  arch = ARCH_HUNYUAN_VIDEO
 
124
  elif architecture == ARCHITECTURE_WAN:
125
  arch = ARCH_WAN
 
 
 
 
126
  else:
127
  raise ValueError(f"Unknown architecture: {architecture}")
128
 
@@ -130,7 +139,6 @@ def build_metadata(
130
  arch += f"/{ADAPTER_LORA}"
131
  metadata["modelspec.architecture"] = arch
132
 
133
- impl = IMPL_HUNYUAN_VIDEO
134
  metadata["modelspec.implementation"] = impl
135
 
136
  if title is None:
 
7
  import safetensors
8
  import logging
9
 
10
+ from dataset.image_video_dataset import ARCHITECTURE_HUNYUAN_VIDEO, ARCHITECTURE_WAN, ARCHITECTURE_FRAMEPACK
11
 
12
  logger = logging.getLogger(__name__)
13
  logger.setLevel(logging.INFO)
 
59
  # Official Wan2.1 weights does not have sai_model_spec, so we use this as an architecture name
60
  ARCH_WAN = "wan2.1"
61
 
62
+ ARCH_FRAMEPACK = "framepack"
63
+
64
  ADAPTER_LORA = "lora"
65
 
66
  IMPL_HUNYUAN_VIDEO = "https://github.com/Tencent/HunyuanVideo"
67
+ IMPL_WAN = "https://github.com/Wan-Video/Wan2.1"
68
+ IMPL_FRAMEPACK = "https://github.com/lllyasviel/FramePack"
69
 
70
  PRED_TYPE_EPSILON = "epsilon"
71
  # PRED_TYPE_V = "v"
 
125
  # arch = ARCH_HUNYUAN_VIDEO
126
  if architecture == ARCHITECTURE_HUNYUAN_VIDEO:
127
  arch = ARCH_HUNYUAN_VIDEO
128
+ impl = IMPL_HUNYUAN_VIDEO
129
  elif architecture == ARCHITECTURE_WAN:
130
  arch = ARCH_WAN
131
+ impl = IMPL_WAN
132
+ elif architecture == ARCHITECTURE_FRAMEPACK:
133
+ arch = ARCH_FRAMEPACK
134
+ impl = IMPL_FRAMEPACK
135
  else:
136
  raise ValueError(f"Unknown architecture: {architecture}")
137
 
 
139
  arch += f"/{ADAPTER_LORA}"
140
  metadata["modelspec.architecture"] = arch
141
 
 
142
  metadata["modelspec.implementation"] = impl
143
 
144
  if title is None:
utils/train_utils.py CHANGED
@@ -36,6 +36,7 @@ def get_sanitized_config_or_none(args: argparse.Namespace):
36
  "vae",
37
  "text_encoder1",
38
  "text_encoder2",
 
39
  "base_weights",
40
  "network_weights",
41
  "output_dir",
 
36
  "vae",
37
  "text_encoder1",
38
  "text_encoder2",
39
+ "image_encoder",
40
  "base_weights",
41
  "network_weights",
42
  "output_dir",
wan/__init__.py CHANGED
@@ -1,3 +1 @@
1
  # from . import configs, distributed, modules
2
- from .image2video import WanI2V
3
- from .text2video import WanT2V
 
1
  # from . import configs, distributed, modules
 
 
wan/configs/__init__.py CHANGED
@@ -1,8 +1,9 @@
1
  # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
  import copy
3
  import os
 
4
 
5
- os.environ['TOKENIZERS_PARALLELISM'] = 'false'
6
 
7
  from .wan_i2v_14B import i2v_14B
8
  from .wan_t2v_1_3B import t2v_1_3B
@@ -10,33 +11,59 @@ from .wan_t2v_14B import t2v_14B
10
 
11
  # the config of t2i_14B is the same as t2v_14B
12
  t2i_14B = copy.deepcopy(t2v_14B)
13
- t2i_14B.__name__ = 'Config: Wan T2I 14B'
14
 
15
  WAN_CONFIGS = {
16
- 't2v-14B': t2v_14B,
17
- 't2v-1.3B': t2v_1_3B,
18
- 'i2v-14B': i2v_14B,
19
- 't2i-14B': t2i_14B,
 
 
 
 
20
  }
21
 
22
  SIZE_CONFIGS = {
23
- '720*1280': (720, 1280),
24
- '1280*720': (1280, 720),
25
- '480*832': (480, 832),
26
- '832*480': (832, 480),
27
- '1024*1024': (1024, 1024),
28
  }
29
 
30
  MAX_AREA_CONFIGS = {
31
- '720*1280': 720 * 1280,
32
- '1280*720': 1280 * 720,
33
- '480*832': 480 * 832,
34
- '832*480': 832 * 480,
35
  }
36
 
37
  SUPPORTED_SIZES = {
38
- 't2v-14B': ('720*1280', '1280*720', '480*832', '832*480'),
39
- 't2v-1.3B': ('480*832', '832*480'),
40
- 'i2v-14B': ('720*1280', '1280*720', '480*832', '832*480'),
41
- 't2i-14B': tuple(SIZE_CONFIGS.keys()),
 
 
 
 
42
  }
 
1
  # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
  import copy
3
  import os
4
+ import torch
5
 
6
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
7
 
8
  from .wan_i2v_14B import i2v_14B
9
  from .wan_t2v_1_3B import t2v_1_3B
 
11
 
12
  # the config of t2i_14B is the same as t2v_14B
13
  t2i_14B = copy.deepcopy(t2v_14B)
14
+ t2i_14B.__name__ = "Config: Wan T2I 14B"
15
+
16
+ # support Fun models: deepcopy and change some configs. FC denotes Fun Control
17
+ t2v_1_3B_FC = copy.deepcopy(t2v_1_3B)
18
+ t2v_1_3B_FC.__name__ = "Config: Wan-Fun-Control T2V 1.3B"
19
+ t2v_1_3B_FC.i2v = True # this is strange, but Fun-Control model needs this because it has img cross-attention
20
+ t2v_1_3B_FC.in_dim = 48
21
+ t2v_1_3B_FC.is_fun_control = True
22
+
23
+ t2v_14B_FC = copy.deepcopy(t2v_14B)
24
+ t2v_14B_FC.__name__ = "Config: Wan-Fun-Control T2V 14B"
25
+ t2v_14B_FC.i2v = True # this is strange, but Fun-Control model needs this because it has img cross-attention
26
+ t2v_14B_FC.in_dim = 48 # same as i2v_14B, use zeros for image latents
27
+ t2v_14B_FC.is_fun_control = True
28
+
29
+ i2v_14B_FC = copy.deepcopy(i2v_14B)
30
+ i2v_14B_FC.__name__ = "Config: Wan-Fun-Control I2V 14B"
31
+ i2v_14B_FC.in_dim = 48
32
+ i2v_14B_FC.is_fun_control = True
33
 
34
  WAN_CONFIGS = {
35
+ "t2v-14B": t2v_14B,
36
+ "t2v-1.3B": t2v_1_3B,
37
+ "i2v-14B": i2v_14B,
38
+ "t2i-14B": t2i_14B,
39
+ # Fun Control models
40
+ "t2v-1.3B-FC": t2v_1_3B_FC,
41
+ "t2v-14B-FC": t2v_14B_FC,
42
+ "i2v-14B-FC": i2v_14B_FC,
43
  }
44
 
45
  SIZE_CONFIGS = {
46
+ "720*1280": (720, 1280),
47
+ "1280*720": (1280, 720),
48
+ "480*832": (480, 832),
49
+ "832*480": (832, 480),
50
+ "1024*1024": (1024, 1024),
51
  }
52
 
53
  MAX_AREA_CONFIGS = {
54
+ "720*1280": 720 * 1280,
55
+ "1280*720": 1280 * 720,
56
+ "480*832": 480 * 832,
57
+ "832*480": 832 * 480,
58
  }
59
 
60
  SUPPORTED_SIZES = {
61
+ "t2v-14B": ("720*1280", "1280*720", "480*832", "832*480"),
62
+ "t2v-1.3B": ("480*832", "832*480"),
63
+ "i2v-14B": ("720*1280", "1280*720", "480*832", "832*480"),
64
+ "t2i-14B": tuple(SIZE_CONFIGS.keys()),
65
+ # Fun Control models
66
+ "t2v-1.3B-FC": ("480*832", "832*480"),
67
+ "t2v-14B-FC": ("720*1280", "1280*720", "480*832", "832*480"),
68
+ "i2v-14B-FC": ("720*1280", "1280*720", "480*832", "832*480"),
69
  }
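The Fun-Control variants are looked up with the same task-style keys as the base models; a quick sketch of what the new entries expose, assuming this package layout:

```python
from wan.configs import WAN_CONFIGS, SUPPORTED_SIZES

cfg = WAN_CONFIGS["t2v-1.3B-FC"]
print(cfg.i2v, cfg.is_fun_control, cfg.in_dim)  # True True 48: control latents are packed into the input
print(SUPPORTED_SIZES["t2v-1.3B-FC"])           # ('480*832', '832*480')
```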
wan/configs/shared_config.py CHANGED
@@ -12,6 +12,7 @@ wan_shared_cfg.text_len = 512
12
 
13
  # transformer
14
  wan_shared_cfg.param_dtype = torch.bfloat16
 
15
 
16
  # inference
17
  wan_shared_cfg.num_train_timesteps = 1000
 
12
 
13
  # transformer
14
  wan_shared_cfg.param_dtype = torch.bfloat16
15
+ wan_shared_cfg.out_dim = 16
16
 
17
  # inference
18
  wan_shared_cfg.num_train_timesteps = 1000
wan/configs/wan_i2v_14B.py CHANGED
@@ -4,22 +4,24 @@ from easydict import EasyDict
4
 
5
  from .shared_config import wan_shared_cfg
6
 
7
- #------------------------ Wan I2V 14B ------------------------#
8
 
9
- i2v_14B = EasyDict(__name__='Config: Wan I2V 14B')
10
  i2v_14B.update(wan_shared_cfg)
 
 
11
 
12
- i2v_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
13
- i2v_14B.t5_tokenizer = 'google/umt5-xxl'
14
 
15
  # clip
16
- i2v_14B.clip_model = 'clip_xlm_roberta_vit_h_14'
17
  i2v_14B.clip_dtype = torch.float16
18
- i2v_14B.clip_checkpoint = 'models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth'
19
- i2v_14B.clip_tokenizer = 'xlm-roberta-large'
20
 
21
  # vae
22
- i2v_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
23
  i2v_14B.vae_stride = (4, 8, 8)
24
 
25
  # transformer
@@ -27,6 +29,7 @@ i2v_14B.patch_size = (1, 2, 2)
27
  i2v_14B.dim = 5120
28
  i2v_14B.ffn_dim = 13824
29
  i2v_14B.freq_dim = 256
 
30
  i2v_14B.num_heads = 40
31
  i2v_14B.num_layers = 40
32
  i2v_14B.window_size = (-1, -1)
 
4
 
5
  from .shared_config import wan_shared_cfg
6
 
7
+ # ------------------------ Wan I2V 14B ------------------------#
8
 
9
+ i2v_14B = EasyDict(__name__="Config: Wan I2V 14B")
10
  i2v_14B.update(wan_shared_cfg)
11
+ i2v_14B.i2v = True
12
+ i2v_14B.is_fun_control = False
13
 
14
+ i2v_14B.t5_checkpoint = "models_t5_umt5-xxl-enc-bf16.pth"
15
+ i2v_14B.t5_tokenizer = "google/umt5-xxl"
16
 
17
  # clip
18
+ i2v_14B.clip_model = "clip_xlm_roberta_vit_h_14"
19
  i2v_14B.clip_dtype = torch.float16
20
+ i2v_14B.clip_checkpoint = "models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"
21
+ i2v_14B.clip_tokenizer = "xlm-roberta-large"
22
 
23
  # vae
24
+ i2v_14B.vae_checkpoint = "Wan2.1_VAE.pth"
25
  i2v_14B.vae_stride = (4, 8, 8)
26
 
27
  # transformer
 
29
  i2v_14B.dim = 5120
30
  i2v_14B.ffn_dim = 13824
31
  i2v_14B.freq_dim = 256
32
+ i2v_14B.in_dim = 36
33
  i2v_14B.num_heads = 40
34
  i2v_14B.num_layers = 40
35
  i2v_14B.window_size = (-1, -1)
wan/configs/wan_t2v_14B.py CHANGED
@@ -3,17 +3,19 @@ from easydict import EasyDict
3
 
4
  from .shared_config import wan_shared_cfg
5
 
6
- #------------------------ Wan T2V 14B ------------------------#
7
 
8
- t2v_14B = EasyDict(__name__='Config: Wan T2V 14B')
9
  t2v_14B.update(wan_shared_cfg)
 
 
10
 
11
  # t5
12
- t2v_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
13
- t2v_14B.t5_tokenizer = 'google/umt5-xxl'
14
 
15
  # vae
16
- t2v_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
17
  t2v_14B.vae_stride = (4, 8, 8)
18
 
19
  # transformer
@@ -21,6 +23,7 @@ t2v_14B.patch_size = (1, 2, 2)
21
  t2v_14B.dim = 5120
22
  t2v_14B.ffn_dim = 13824
23
  t2v_14B.freq_dim = 256
 
24
  t2v_14B.num_heads = 40
25
  t2v_14B.num_layers = 40
26
  t2v_14B.window_size = (-1, -1)
 
3
 
4
  from .shared_config import wan_shared_cfg
5
 
6
+ # ------------------------ Wan T2V 14B ------------------------#
7
 
8
+ t2v_14B = EasyDict(__name__="Config: Wan T2V 14B")
9
  t2v_14B.update(wan_shared_cfg)
10
+ t2v_14B.i2v = False
11
+ t2v_14B.is_fun_control = False
12
 
13
  # t5
14
+ t2v_14B.t5_checkpoint = "models_t5_umt5-xxl-enc-bf16.pth"
15
+ t2v_14B.t5_tokenizer = "google/umt5-xxl"
16
 
17
  # vae
18
+ t2v_14B.vae_checkpoint = "Wan2.1_VAE.pth"
19
  t2v_14B.vae_stride = (4, 8, 8)
20
 
21
  # transformer
 
23
  t2v_14B.dim = 5120
24
  t2v_14B.ffn_dim = 13824
25
  t2v_14B.freq_dim = 256
26
+ t2v_14B.in_dim = 16
27
  t2v_14B.num_heads = 40
28
  t2v_14B.num_layers = 40
29
  t2v_14B.window_size = (-1, -1)
wan/configs/wan_t2v_1_3B.py CHANGED
@@ -3,17 +3,19 @@ from easydict import EasyDict
3
 
4
  from .shared_config import wan_shared_cfg
5
 
6
- #------------------------ Wan T2V 1.3B ------------------------#
7
 
8
- t2v_1_3B = EasyDict(__name__='Config: Wan T2V 1.3B')
9
  t2v_1_3B.update(wan_shared_cfg)
 
 
10
 
11
  # t5
12
- t2v_1_3B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
13
- t2v_1_3B.t5_tokenizer = 'google/umt5-xxl'
14
 
15
  # vae
16
- t2v_1_3B.vae_checkpoint = 'Wan2.1_VAE.pth'
17
  t2v_1_3B.vae_stride = (4, 8, 8)
18
 
19
  # transformer
@@ -21,6 +23,7 @@ t2v_1_3B.patch_size = (1, 2, 2)
21
  t2v_1_3B.dim = 1536
22
  t2v_1_3B.ffn_dim = 8960
23
  t2v_1_3B.freq_dim = 256
 
24
  t2v_1_3B.num_heads = 12
25
  t2v_1_3B.num_layers = 30
26
  t2v_1_3B.window_size = (-1, -1)
 
3
 
4
  from .shared_config import wan_shared_cfg
5
 
6
+ # ------------------------ Wan T2V 1.3B ------------------------#
7
 
8
+ t2v_1_3B = EasyDict(__name__="Config: Wan T2V 1.3B")
9
  t2v_1_3B.update(wan_shared_cfg)
10
+ t2v_1_3B.i2v = False
11
+ t2v_1_3B.is_fun_control = False
12
 
13
  # t5
14
+ t2v_1_3B.t5_checkpoint = "models_t5_umt5-xxl-enc-bf16.pth"
15
+ t2v_1_3B.t5_tokenizer = "google/umt5-xxl"
16
 
17
  # vae
18
+ t2v_1_3B.vae_checkpoint = "Wan2.1_VAE.pth"
19
  t2v_1_3B.vae_stride = (4, 8, 8)
20
 
21
  # transformer
 
23
  t2v_1_3B.dim = 1536
24
  t2v_1_3B.ffn_dim = 8960
25
  t2v_1_3B.freq_dim = 256
26
+ t2v_1_3B.in_dim = 16
27
  t2v_1_3B.num_heads = 12
28
  t2v_1_3B.num_layers = 30
29
  t2v_1_3B.window_size = (-1, -1)
wan/modules/model.py CHANGED
@@ -1,13 +1,25 @@
1
  # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
  import math
 
3
 
4
  import torch
5
  import torch.nn as nn
6
  from torch.utils.checkpoint import checkpoint
 
 
 
 
 
 
 
 
 
 
7
 
8
  from .attention import flash_attention
9
  from utils.device_utils import clean_memory_on_device
10
  from modules.custom_offloading_utils import ModelOffloader
 
11
 
12
  __all__ = ["WanModel"]
13
 
@@ -602,6 +614,40 @@ class WanModel(nn.Module): # ModelMixin, ConfigMixin):
602
  def device(self):
603
  return next(self.parameters()).device
604
605
  def enable_gradient_checkpointing(self):
606
  self.gradient_checkpointing = True
607
 
@@ -661,7 +707,7 @@ class WanModel(nn.Module): # ModelMixin, ConfigMixin):
661
  return
662
  self.offloader.prepare_block_devices_before_forward(self.blocks)
663
 
664
- def forward(self, x, t, context, seq_len, clip_fea=None, y=None):
665
  r"""
666
  Forward pass through the diffusion model
667
 
@@ -683,8 +729,9 @@ class WanModel(nn.Module): # ModelMixin, ConfigMixin):
683
  List[Tensor]:
684
  List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8]
685
  """
686
- if self.model_type == "i2v":
687
- assert clip_fea is not None and y is not None
 
688
  # params
689
  device = self.patch_embedding.weight.device
690
  if self.freqs.device != device:
@@ -738,10 +785,13 @@ class WanModel(nn.Module): # ModelMixin, ConfigMixin):
738
 
739
  # print(f"x: {x.shape}, e: {e0.shape}, context: {context.shape}, seq_lens: {seq_lens}")
740
  for block_idx, block in enumerate(self.blocks):
741
- if self.blocks_to_swap:
 
 
742
  self.offloader.wait_for_block(block_idx)
743
 
744
- x = block(x, **kwargs)
 
745
 
746
  if self.blocks_to_swap:
747
  self.offloader.submit_move_blocks_forward(self.blocks, block_idx)
@@ -801,3 +851,83 @@ class WanModel(nn.Module): # ModelMixin, ConfigMixin):
801
 
802
  # init output layer
803
  nn.init.zeros_(self.head.head.weight)
1
  # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
  import math
3
+ from typing import Optional, Union
4
 
5
  import torch
6
  import torch.nn as nn
7
  from torch.utils.checkpoint import checkpoint
8
+ from accelerate import init_empty_weights
9
+
10
+ import logging
11
+
12
+ from utils.safetensors_utils import MemoryEfficientSafeOpen, load_safetensors
13
+
14
+ logger = logging.getLogger(__name__)
15
+ logging.basicConfig(level=logging.INFO)
16
+
17
+ from utils.device_utils import clean_memory_on_device
18
 
19
  from .attention import flash_attention
20
  from utils.device_utils import clean_memory_on_device
21
  from modules.custom_offloading_utils import ModelOffloader
22
+ from modules.fp8_optimization_utils import apply_fp8_monkey_patch, optimize_state_dict_with_fp8
23
 
24
  __all__ = ["WanModel"]
25
 
 
614
  def device(self):
615
  return next(self.parameters()).device
616
 
617
+ def fp8_optimization(
618
+ self, state_dict: dict[str, torch.Tensor], device: torch.device, move_to_device: bool, use_scaled_mm: bool = False
619
+ ) -> int:
620
+ """
621
+ Optimize the model state_dict with fp8.
622
+
623
+ Args:
624
+ state_dict (dict[str, torch.Tensor]):
625
+ The state_dict of the model.
626
+ device (torch.device):
627
+ The device to calculate the weight.
628
+ move_to_device (bool):
629
+ Whether to move the weight to the device after optimization.
630
+ """
631
+ TARGET_KEYS = ["blocks"]
632
+ EXCLUDE_KEYS = [
633
+ "norm",
634
+ "patch_embedding",
635
+ "text_embedding",
636
+ "time_embedding",
637
+ "time_projection",
638
+ "head",
639
+ "modulation",
640
+ "img_emb",
641
+ ]
642
+
643
+ # inplace optimization
644
+ state_dict = optimize_state_dict_with_fp8(state_dict, device, TARGET_KEYS, EXCLUDE_KEYS, move_to_device=move_to_device)
645
+
646
+ # apply monkey patching
647
+ apply_fp8_monkey_patch(self, state_dict, use_scaled_mm=use_scaled_mm)
648
+
649
+ return state_dict
650
+
651
  def enable_gradient_checkpointing(self):
652
  self.gradient_checkpointing = True
653
 
 
707
  return
708
  self.offloader.prepare_block_devices_before_forward(self.blocks)
709
 
710
+ def forward(self, x, t, context, seq_len, clip_fea=None, y=None, skip_block_indices=None):
711
  r"""
712
  Forward pass through the diffusion model
713
 
 
729
  List[Tensor]:
730
  List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8]
731
  """
732
+ # remove assertions to work with Fun-Control T2V
733
+ # if self.model_type == "i2v":
734
+ # assert clip_fea is not None and y is not None
735
  # params
736
  device = self.patch_embedding.weight.device
737
  if self.freqs.device != device:
 
785
 
786
  # print(f"x: {x.shape}, e: {e0.shape}, context: {context.shape}, seq_lens: {seq_lens}")
787
  for block_idx, block in enumerate(self.blocks):
788
+ is_block_skipped = skip_block_indices is not None and block_idx in skip_block_indices
789
+
790
+ if self.blocks_to_swap and not is_block_skipped:
791
  self.offloader.wait_for_block(block_idx)
792
 
793
+ if not is_block_skipped:
794
+ x = block(x, **kwargs)
795
 
796
  if self.blocks_to_swap:
797
  self.offloader.submit_move_blocks_forward(self.blocks, block_idx)
 
851
 
852
  # init output layer
853
  nn.init.zeros_(self.head.head.weight)
854
+
855
+
856
+ def detect_wan_sd_dtype(path: str) -> torch.dtype:
857
+ # get dtype from model weights
858
+ with MemoryEfficientSafeOpen(path) as f:
859
+ keys = set(f.keys())
860
+ key1 = "model.diffusion_model.blocks.0.cross_attn.k.weight" # 1.3B
861
+ key2 = "blocks.0.cross_attn.k.weight" # 14B
862
+ if key1 in keys:
863
+ dit_dtype = f.get_tensor(key1).dtype
864
+ elif key2 in keys:
865
+ dit_dtype = f.get_tensor(key2).dtype
866
+ else:
867
+ raise ValueError(f"Could not find the dtype in the model weights: {path}")
868
+ logger.info(f"Detected DiT dtype: {dit_dtype}")
869
+ return dit_dtype
870
+
871
+
872
+ def load_wan_model(
873
+ config: any,
874
+ device: Union[str, torch.device],
875
+ dit_path: str,
876
+ attn_mode: str,
877
+ split_attn: bool,
878
+ loading_device: Union[str, torch.device],
879
+ dit_weight_dtype: Optional[torch.dtype],
880
+ fp8_scaled: bool = False,
881
+ ) -> WanModel:
882
+ # dit_weight_dtype is None for fp8_scaled
883
+ assert (not fp8_scaled and dit_weight_dtype is not None) or (fp8_scaled and dit_weight_dtype is None)
884
+
885
+ device = torch.device(device)
886
+ loading_device = torch.device(loading_device)
887
+
888
+ with init_empty_weights():
889
+ logger.info(f"Creating WanModel")
890
+ model = WanModel(
891
+ model_type="i2v" if config.i2v else "t2v",
892
+ dim=config.dim,
893
+ eps=config.eps,
894
+ ffn_dim=config.ffn_dim,
895
+ freq_dim=config.freq_dim,
896
+ in_dim=config.in_dim,
897
+ num_heads=config.num_heads,
898
+ num_layers=config.num_layers,
899
+ out_dim=config.out_dim,
900
+ text_len=config.text_len,
901
+ attn_mode=attn_mode,
902
+ split_attn=split_attn,
903
+ )
904
+ if dit_weight_dtype is not None:
905
+ model.to(dit_weight_dtype)
906
+
907
+ # if fp8_scaled, load model weights to CPU to reduce VRAM usage. Otherwise, load to the specified device (CPU for block swap or CUDA for others)
908
+ wan_loading_device = torch.device("cpu") if fp8_scaled else loading_device
909
+ logger.info(f"Loading DiT model from {dit_path}, device={wan_loading_device}, dtype={dit_weight_dtype}")
910
+
911
+ # load model weights with the specified dtype or as is
912
+ sd = load_safetensors(dit_path, wan_loading_device, disable_mmap=True, dtype=dit_weight_dtype)
913
+
914
+ # remove "model.diffusion_model." prefix: 1.3B model has this prefix
915
+ for key in list(sd.keys()):
916
+ if key.startswith("model.diffusion_model."):
917
+ sd[key[22:]] = sd.pop(key)
918
+
919
+ if fp8_scaled:
920
+ # fp8 optimization: calculate on CUDA, move back to CPU if loading_device is CPU (block swap)
921
+ logger.info(f"Optimizing model weights to fp8. This may take a while.")
922
+ sd = model.fp8_optimization(sd, device, move_to_device=loading_device.type == "cpu")
923
+
924
+ if loading_device.type != "cpu":
925
+ # make sure all the model weights are on the loading_device
926
+ logger.info(f"Moving weights to {loading_device}")
927
+ for key in sd.keys():
928
+ sd[key] = sd[key].to(loading_device)
929
+
930
+ info = model.load_state_dict(sd, strict=True, assign=True)
931
+ logger.info(f"Loaded DiT model from {dit_path}, info={info}")
932
+
933
+ return model