import os
import logging
from types import SimpleNamespace
from typing import Optional, Union

import accelerate
from accelerate import Accelerator, init_empty_weights
import torch
from safetensors.torch import load_file
from transformers import (
    LlamaTokenizerFast,
    LlamaConfig,
    LlamaModel,
    CLIPTokenizer,
    CLIPTextModel,
    CLIPConfig,
    SiglipImageProcessor,
    SiglipVisionModel,
    SiglipVisionConfig,
)

from utils.safetensors_utils import load_split_weights
from hunyuan_model.vae import load_vae as hunyuan_load_vae

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
def load_vae(
    vae_path: str, vae_chunk_size: Optional[int], vae_spatial_tile_sample_min_size: Optional[int], device: Union[str, torch.device]
):
    # single file and directory (contains 'vae') support
    if os.path.isdir(vae_path):
        vae_path = os.path.join(vae_path, "vae", "diffusion_pytorch_model.safetensors")

    vae_dtype = torch.float16  # if vae_dtype is None else str_to_dtype(vae_dtype)
    vae, _, s_ratio, t_ratio = hunyuan_load_vae(vae_dtype=vae_dtype, device=device, vae_path=vae_path)
    vae.eval()
    # vae_kwargs = {"s_ratio": s_ratio, "t_ratio": t_ratio}

    # set chunk_size for CausalConv3d recursively
    chunk_size = vae_chunk_size
    if chunk_size is not None:
        vae.set_chunk_size_for_causal_conv_3d(chunk_size)
        logger.info(f"Set chunk_size to {chunk_size} for CausalConv3d")

    if vae_spatial_tile_sample_min_size is not None:
        vae.enable_spatial_tiling(True)
        vae.tile_sample_min_size = vae_spatial_tile_sample_min_size
        vae.tile_latent_min_size = vae_spatial_tile_sample_min_size // 8
        logger.info(f"Enabled spatial tiling with min size {vae_spatial_tile_sample_min_size}")
    # elif vae_tiling:
    else:
        vae.enable_spatial_tiling(True)

    return vae
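

# Example usage (sketch): load the VAE either from a diffusers-style directory or from a single
# .safetensors file. The path and the chunk/tile sizes below are placeholders for illustration,
# and the encode call assumes the diffusers-like `encode().latent_dist` interface of the
# HunyuanVideo causal VAE.
#
#   vae = load_vae(
#       vae_path="ckpts/vae/diffusion_pytorch_model.safetensors",  # hypothetical path
#       vae_chunk_size=32,
#       vae_spatial_tile_sample_min_size=256,
#       device="cuda",
#   )
#   with torch.no_grad():
#       latents = vae.encode(video_frames).latent_dist.sample()  # video_frames prepared elsewhere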
# region Text Encoders

# Text Encoder configs are copied from HunyuanVideo repo
LLAMA_CONFIG = {
    "architectures": ["LlamaModel"],
    "attention_bias": False,
    "attention_dropout": 0.0,
    "bos_token_id": 128000,
    "eos_token_id": 128001,
    "head_dim": 128,
    "hidden_act": "silu",
    "hidden_size": 4096,
    "initializer_range": 0.02,
    "intermediate_size": 14336,
    "max_position_embeddings": 8192,
    "mlp_bias": False,
    "model_type": "llama",
    "num_attention_heads": 32,
    "num_hidden_layers": 32,
    "num_key_value_heads": 8,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-05,
    "rope_scaling": None,
    "rope_theta": 500000.0,
    "tie_word_embeddings": False,
    "torch_dtype": "float16",
    "transformers_version": "4.46.3",
    "use_cache": True,
    "vocab_size": 128320,
}

CLIP_CONFIG = {
    # "_name_or_path": "/raid/aryan/llava-llama-3-8b-v1_1-extracted/text_encoder_2",
    "architectures": ["CLIPTextModel"],
    "attention_dropout": 0.0,
    "bos_token_id": 0,
    "dropout": 0.0,
    "eos_token_id": 2,
    "hidden_act": "quick_gelu",
    "hidden_size": 768,
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_eps": 1e-05,
    "max_position_embeddings": 77,
    "model_type": "clip_text_model",
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "pad_token_id": 1,
    "projection_dim": 768,
    "torch_dtype": "float16",
    "transformers_version": "4.48.0.dev0",
    "vocab_size": 49408,
}
def load_text_encoder1(
    args, fp8_llm: Optional[bool] = False, device: Optional[Union[str, torch.device]] = None
) -> tuple[LlamaTokenizerFast, LlamaModel]:
    # single file, split file and directory (contains 'text_encoder') support
    logger.info("Loading text encoder 1 tokenizer")
    tokenizer1 = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="tokenizer")

    logger.info(f"Loading text encoder 1 from {args.text_encoder1}")
    if os.path.isdir(args.text_encoder1):
        # load from directory, configs are in the directory
        text_encoder1 = LlamaModel.from_pretrained(args.text_encoder1, subfolder="text_encoder", torch_dtype=torch.float16)
    else:
        # load from file, we create the model with the appropriate config
        config = LlamaConfig(**LLAMA_CONFIG)
        with init_empty_weights():
            text_encoder1 = LlamaModel._from_config(config, torch_dtype=torch.float16)

        state_dict = load_split_weights(args.text_encoder1)

        # support weights from ComfyUI
        if "model.embed_tokens.weight" in state_dict:
            for key in list(state_dict.keys()):
                if key.startswith("model."):
                    new_key = key.replace("model.", "")
                    state_dict[new_key] = state_dict[key]
                    del state_dict[key]
        if "tokenizer" in state_dict:
            state_dict.pop("tokenizer")
        if "lm_head.weight" in state_dict:
            state_dict.pop("lm_head.weight")

        text_encoder1.load_state_dict(state_dict, strict=True, assign=True)

    if fp8_llm:
        org_dtype = text_encoder1.dtype
        logger.info(f"Moving and casting text encoder to {device} and torch.float8_e4m3fn")
        text_encoder1.to(device=device, dtype=torch.float8_e4m3fn)

        # prepare LLM for fp8
        def prepare_fp8(llama_model: LlamaModel, target_dtype):
            def forward_hook(module):
                def forward(hidden_states):
                    input_dtype = hidden_states.dtype
                    hidden_states = hidden_states.to(torch.float32)
                    variance = hidden_states.pow(2).mean(-1, keepdim=True)
                    hidden_states = hidden_states * torch.rsqrt(variance + module.variance_epsilon)
                    return module.weight.to(input_dtype) * hidden_states.to(input_dtype)

                return forward

            for module in llama_model.modules():
                if module.__class__.__name__ in ["Embedding"]:
                    # print("set", module.__class__.__name__, "to", target_dtype)
                    module.to(target_dtype)
                if module.__class__.__name__ in ["LlamaRMSNorm"]:
                    # print("set", module.__class__.__name__, "hooks")
                    module.forward = forward_hook(module)

        prepare_fp8(text_encoder1, org_dtype)
    else:
        text_encoder1.to(device)

    text_encoder1.eval()
    return tokenizer1, text_encoder1
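

# Example usage (sketch): the loader expects an argparse-style object with a `text_encoder1`
# attribute; a SimpleNamespace works for quick tests. The checkpoint path is a placeholder.
#
#   args = SimpleNamespace(text_encoder1="ckpts/llava_llama3_fp16.safetensors")  # hypothetical path
#   tokenizer1, text_encoder1 = load_text_encoder1(args, fp8_llm=False, device="cuda")
#   tokens = tokenizer1("a cat walks on the grass", return_tensors="pt").to("cuda")
#   with torch.no_grad():
#       hidden_states = text_encoder1(**tokens, output_hidden_states=True).hidden_states[-1]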
def load_text_encoder2(args) -> tuple[CLIPTokenizer, CLIPTextModel]:
    # single file and directory (contains 'text_encoder_2') support
    logger.info("Loading text encoder 2 tokenizer")
    tokenizer2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="tokenizer_2")

    logger.info(f"Loading text encoder 2 from {args.text_encoder2}")
    if os.path.isdir(args.text_encoder2):
        # load from directory, configs are in the directory
        text_encoder2 = CLIPTextModel.from_pretrained(args.text_encoder2, subfolder="text_encoder_2", torch_dtype=torch.float16)
    else:
        # we only have one file, so we can load it directly
        config = CLIPConfig(**CLIP_CONFIG)
        with init_empty_weights():
            text_encoder2 = CLIPTextModel._from_config(config, torch_dtype=torch.float16)

        state_dict = load_file(args.text_encoder2)

        text_encoder2.load_state_dict(state_dict, strict=True, assign=True)

    text_encoder2.eval()
    return tokenizer2, text_encoder2
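

# Example usage (sketch; placeholder checkpoint path): the CLIP text encoder is left on CPU in
# float16 here, so move it to a device yourself if needed.
#
#   args = SimpleNamespace(text_encoder2="ckpts/clip_l.safetensors")  # hypothetical path
#   tokenizer2, text_encoder2 = load_text_encoder2(args)
#   tokens = tokenizer2("a cat walks on the grass", padding="max_length", max_length=77, truncation=True, return_tensors="pt")
#   with torch.no_grad():
#       pooled = text_encoder2(**tokens).pooler_output  # pooled embedding of shape (1, 768)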
# endregion

# region image encoder

# Siglip configs are copied from FramePack repo
FEATURE_EXTRACTOR_CONFIG = {
    "do_convert_rgb": None,
    "do_normalize": True,
    "do_rescale": True,
    "do_resize": True,
    "image_mean": [0.5, 0.5, 0.5],
    "image_processor_type": "SiglipImageProcessor",
    "image_std": [0.5, 0.5, 0.5],
    "processor_class": "SiglipProcessor",
    "resample": 3,
    "rescale_factor": 0.00392156862745098,
    "size": {"height": 384, "width": 384},
}

IMAGE_ENCODER_CONFIG = {
    "_name_or_path": "/home/lvmin/.cache/huggingface/hub/models--black-forest-labs--FLUX.1-Redux-dev/snapshots/1282f955f706b5240161278f2ef261d2a29ad649/image_encoder",
    "architectures": ["SiglipVisionModel"],
    "attention_dropout": 0.0,
    "hidden_act": "gelu_pytorch_tanh",
    "hidden_size": 1152,
    "image_size": 384,
    "intermediate_size": 4304,
    "layer_norm_eps": 1e-06,
    "model_type": "siglip_vision_model",
    "num_attention_heads": 16,
    "num_channels": 3,
    "num_hidden_layers": 27,
    "patch_size": 14,
    "torch_dtype": "bfloat16",
    "transformers_version": "4.46.2",
}
def load_image_encoders(args):
    logger.info("Loading image encoder feature extractor")
    feature_extractor = SiglipImageProcessor(**FEATURE_EXTRACTOR_CONFIG)

    # single file, split file and directory (contains 'image_encoder') support
    logger.info(f"Loading image encoder from {args.image_encoder}")
    if os.path.isdir(args.image_encoder):
        # load from directory, configs are in the directory
        image_encoder = SiglipVisionModel.from_pretrained(args.image_encoder, subfolder="image_encoder", torch_dtype=torch.float16)
    else:
        # load from file, we create the model with the appropriate config
        config = SiglipVisionConfig(**IMAGE_ENCODER_CONFIG)
        with init_empty_weights():
            image_encoder = SiglipVisionModel._from_config(config, torch_dtype=torch.float16)

        state_dict = load_file(args.image_encoder)

        image_encoder.load_state_dict(state_dict, strict=True, assign=True)

    image_encoder.eval()
    return feature_extractor, image_encoder
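

# Example usage (sketch; placeholder checkpoint path): preprocess a PIL image with the SigLIP
# feature extractor, cast pixel values to the encoder's dtype (float16 here), and take the
# patch-token embeddings.
#
#   from PIL import Image
#   args = SimpleNamespace(image_encoder="ckpts/sigclip_vision_patch14_384.safetensors")  # hypothetical path
#   feature_extractor, image_encoder = load_image_encoders(args)
#   image = Image.open("reference.png").convert("RGB")
#   pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(image_encoder.dtype)
#   with torch.no_grad():
#       image_embeds = image_encoder(pixel_values=pixel_values).last_hidden_state  # (1, 729, 1152) for 384x384 input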
# endregion