aaa

Running

App Files Files Community

A24005179 committed 26 days ago

Commit

b62cca0

verified ·

1 Parent(s): b08f86e

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -54

app.py CHANGED Viewed

@@ -14,7 +14,7 @@ from PIL import Image
 from diffusers import AutoencoderKLHunyuanVideo
 from transformers import (
     LlamaModel, CLIPTextModel,
-    LlamaTokenizerFast, CLIPTokenizer
 )
 from diffusers_helper.hunyuan import (
     encode_prompt_conds, vae_decode,
@@ -28,8 +28,10 @@ from diffusers_helper.utils import (
 )
 from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
 from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
-# Remove or replace GPU-specific imports
 device = torch.device("cpu")
 # Load models
@@ -61,7 +63,8 @@ vae = AutoencoderKLHunyuanVideo.from_pretrained(
     torch_dtype=torch.float16
 ).to(device)
-feature_extractor = SiglipImageProcessor.from_pretrained(
     "lllyasviel/flux_redux_bfl",
     subfolder='feature_extractor'
 )
@@ -177,7 +180,6 @@ def worker(
     total_latent_sections = int(max(round(total_latent_sections), 1))
     job_id = generate_timestamp()
     stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
     try:
         llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
         if cfg == 1:
@@ -186,44 +188,35 @@ def worker(
             llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
         llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
         llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
         H, W, C = input_image.shape
         height, width = find_nearest_bucket(H, W, resolution=640)
         input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
         Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
         input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
         input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
         start_latent = vae_encode(input_image_pt, vae).to(device)
         image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
         image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
         llama_vec = llama_vec.to(transformer.dtype).to(device)
         llama_vec_n = llama_vec_n.to(transformer.dtype).to(device)
         clip_l_pooler = clip_l_pooler.to(transformer.dtype).to(device)
         clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype).to(device)
         image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype).to(device)
         rnd = torch.Generator("cpu").manual_seed(seed)
         history_latents = torch.zeros(
             size=(1, 16, 16 + 2 + 1, height // 8, width // 8),
             dtype=torch.float32
         ).to(device)
         history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
         total_generated_latent_frames = 1
         for section_index in range(total_latent_sections):
             if stream.input_queue.top() == 'end':
                 stream.output_queue.push(('end', None))
                 return
             if use_teacache:
                 transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
             else:
                 transformer.initialize_teacache(enable_teacache=False)
             def callback(d):
                 preview = d['denoised']
                 preview = vae_decode_fake(preview)
@@ -238,7 +231,6 @@ def worker(
                 desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}'
                 stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
                 return
             indices = torch.arange(
                 0, sum([1, 16, 2, 1, latent_window_size])
             ).unsqueeze(0)
@@ -249,7 +241,6 @@ def worker(
                 clean_latent_1x_indices,
                 latent_indices
             ) = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
             clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
             clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[
                 :, :, -sum([16, 2, 1]):, :, :
@@ -258,7 +249,6 @@ def worker(
                 [start_latent.to(history_latents), clean_latents_1x],
                 dim=2
             )
             generated_latents = sample_hunyuan(
                 transformer=transformer,
                 sampler='unipc',
@@ -288,10 +278,8 @@ def worker(
                 clean_latent_4x_indices=clean_latent_4x_indices,
                 callback=callback,
             )
             total_generated_latent_frames += int(generated_latents.shape[2])
             history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
             real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
             if history_pixels is None:
                 history_pixels = vae_decode(real_history_latents, vae).cpu()
@@ -304,14 +292,11 @@ def worker(
                 history_pixels = soft_append_bcthw(
                     history_pixels, current_pixels, overlapped_frames
                 )
             output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
             save_bcthw_as_mp4(history_pixels, output_filename, fps=30)
             stream.output_queue.push(('file', output_filename))
     except Exception as e:
         traceback.print_exc()
     stream.output_queue.push(('end', None))
     return
@@ -332,14 +317,17 @@ def process(
     use_teacache=True, mp4_crf=16, quality_radio="640x360", aspect_ratio="1:1"
 ):
     global stream
     quality_map = {
         "360p": (640, 360),
         "480p": (854, 480),
         "540p": (960, 540),
         "720p": (1280, 720),
-        "640x360": (640, 360),  # fallback for default
     }
-    # Aspect ratio map: (width, height)
     aspect_map = {
         "1:1": (1, 1),
         "3:4": (3, 4),
@@ -347,53 +335,36 @@ def process(
         "16:9": (16, 9),
         "9:16": (9, 16),
     }
-    selected_quality = quality_map.get(quality_radio, (640, 360))
-    base_width, base_height = selected_quality
     if t2v:
-        # Use aspect ratio to determine final width/height
         ar_w, ar_h = aspect_map.get(aspect_ratio, (1, 1))
         if ar_w >= ar_h:
-            target_height = base_height
-            target_width = int(round(target_height * ar_w / ar_h))
-        else:
-            target_width = base_width
             target_height = int(round(target_width * ar_h / ar_w))
         input_image = np.ones((target_height, target_width, 3), dtype=np.uint8) * 255
         print(f"Using blank white image for text-to-video mode, {target_width}x{target_height} ({aspect_ratio})")
     else:
-        target_width, target_height = selected_quality
-        if isinstance(input_image, dict) and "composite" in input_image:
-            composite_rgba_uint8 = input_image["composite"]
-            rgb_uint8 = composite_rgba_uint8[:, :, :3]
-            mask_uint8 = composite_rgba_uint8[:, :, 3]
-            h, w = rgb_uint8.shape[:2]
-            background_uint8 = np.full((h, w, 3), 255, dtype=np.uint8)
-            alpha_normalized_float32 = mask_uint8.astype(np.float32) / 255.0
-            alpha_mask_float32 = np.stack([alpha_normalized_float32]*3, axis=2)
-            blended_image_float32 = rgb_uint8.astype(np.float32) * alpha_mask_float32 + \
-                                background_uint8.astype(np.float32) * (1.0 - alpha_mask_float32)
-            input_image = np.clip(blended_image_float32, 0, 255).astype(np.uint8)
-        elif input_image is None:
-            raise ValueError("Please provide an input image or enable Text to Video mode")
-        else:
-            input_image = input_image.astype(np.uint8)
     yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
     stream = AsyncStream()
     async_run(
         worker, input_image, prompt, n_prompt, seed,
         total_second_length, latent_window_size, steps,
         cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf
     )
     output_filename = None
     while True:
         flag, data = stream.output_queue.next()
         if flag == 'file':
             output_filename = data
             yield (
@@ -404,7 +375,6 @@ def process(
                 gr.update(interactive=False),
                 gr.update(interactive=True)
             )
         elif flag == 'progress':
             preview, desc, html = data
             yield (
@@ -415,7 +385,6 @@ def process(
                 gr.update(interactive=False),
                 gr.update(interactive=True)
             )
         elif flag == 'end':
             yield (
                 output_filename,
@@ -430,7 +399,6 @@ def process(
 def end_process():
     """Ask the running job to stop by pushing 'end' onto the stream's input queue.

     The worker loop reads the top of ``stream.input_queue`` at the start of
     each section and, when it sees 'end', pushes an ('end', None) message to
     the output queue and returns.
     """
     # NOTE(review): `stream` is a module-global that process() rebinds via
     # `global stream`; this assumes it is already assigned when this runs —
     # TODO confirm a module-level initialization exists.
     stream.input_queue.push('end')
 quick_prompts = [
     'The girl dances gracefully, with clear movements, full of charm.',
     'A character doing some simple body movements.'

 from diffusers import AutoencoderKLHunyuanVideo
 from transformers import (
     LlamaModel, CLIPTextModel,
+    LlamaTokenizerFast, CLIPTokenizer, AutoImageProcessor
 )
 from diffusers_helper.hunyuan import (
     encode_prompt_conds, vae_decode,
 )
 from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
 from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
+from diffusers_helper.clip_vision import hf_clip_vision_encode
+from diffusers_helper.bucket_tools import find_nearest_bucket
+# Set device to CPU
 device = torch.device("cpu")
 # Load models
     torch_dtype=torch.float16
 ).to(device)
+# Use AutoImageProcessor instead of SiglipImageProcessor
+feature_extractor = AutoImageProcessor.from_pretrained(
     "lllyasviel/flux_redux_bfl",
     subfolder='feature_extractor'
 )
     total_latent_sections = int(max(round(total_latent_sections), 1))
     job_id = generate_timestamp()
     stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
     try:
         llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
         if cfg == 1:
             llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
         llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
         llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
         H, W, C = input_image.shape
         height, width = find_nearest_bucket(H, W, resolution=640)
         input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
         Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
         input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
         input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
         start_latent = vae_encode(input_image_pt, vae).to(device)
         image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
         image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
         llama_vec = llama_vec.to(transformer.dtype).to(device)
         llama_vec_n = llama_vec_n.to(transformer.dtype).to(device)
         clip_l_pooler = clip_l_pooler.to(transformer.dtype).to(device)
         clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype).to(device)
         image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype).to(device)
         rnd = torch.Generator("cpu").manual_seed(seed)
         history_latents = torch.zeros(
             size=(1, 16, 16 + 2 + 1, height // 8, width // 8),
             dtype=torch.float32
         ).to(device)
         history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
         total_generated_latent_frames = 1
         for section_index in range(total_latent_sections):
             if stream.input_queue.top() == 'end':
                 stream.output_queue.push(('end', None))
                 return
             if use_teacache:
                 transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
             else:
                 transformer.initialize_teacache(enable_teacache=False)
             def callback(d):
                 preview = d['denoised']
                 preview = vae_decode_fake(preview)
                 desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}'
                 stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
                 return
             indices = torch.arange(
                 0, sum([1, 16, 2, 1, latent_window_size])
             ).unsqueeze(0)
                 clean_latent_1x_indices,
                 latent_indices
             ) = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
             clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
             clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[
                 :, :, -sum([16, 2, 1]):, :, :
                 [start_latent.to(history_latents), clean_latents_1x],
                 dim=2
             )
             generated_latents = sample_hunyuan(
                 transformer=transformer,
                 sampler='unipc',
                 clean_latent_4x_indices=clean_latent_4x_indices,
                 callback=callback,
             )
             total_generated_latent_frames += int(generated_latents.shape[2])
             history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
             real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
             if history_pixels is None:
                 history_pixels = vae_decode(real_history_latents, vae).cpu()
                 history_pixels = soft_append_bcthw(
                     history_pixels, current_pixels, overlapped_frames
                 )
             output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
             save_bcthw_as_mp4(history_pixels, output_filename, fps=30)
             stream.output_queue.push(('file', output_filename))
     except Exception as e:
         traceback.print_exc()
     stream.output_queue.push(('end', None))
     return
     use_teacache=True, mp4_crf=16, quality_radio="640x360", aspect_ratio="1:1"
 ):
     global stream
+    # Map quality options to actual resolutions
     quality_map = {
         "360p": (640, 360),
         "480p": (854, 480),
         "540p": (960, 540),
         "720p": (1280, 720),
+        "640x360": (640, 360),  # fallback
     }
+    # Map aspect ratio strings to width/height ratios
     aspect_map = {
         "1:1": (1, 1),
         "3:4": (3, 4),
         "16:9": (16, 9),
         "9:16": (9, 16),
     }
+    # Get target resolution based on selected quality
+    target_width, target_height = quality_map.get(quality_radio, (640, 360))
     if t2v:
         ar_w, ar_h = aspect_map.get(aspect_ratio, (1, 1))
+        # Recalculate based on aspect ratio
         if ar_w >= ar_h:
             target_height = int(round(target_width * ar_h / ar_w))
+        else:
+            target_width = int(round(target_height * ar_w / ar_h))
         input_image = np.ones((target_height, target_width, 3), dtype=np.uint8) * 255
         print(f"Using blank white image for text-to-video mode, {target_width}x{target_height} ({aspect_ratio})")
     else:
+        # Resize and crop input image to match selected resolution
+        H, W, C = input_image.shape
+        height, width = find_nearest_bucket(H, W, resolution=target_width)
+        input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
+        Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
     yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
     stream = AsyncStream()
     async_run(
         worker, input_image, prompt, n_prompt, seed,
         total_second_length, latent_window_size, steps,
         cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf
     )
     output_filename = None
     while True:
         flag, data = stream.output_queue.next()
         if flag == 'file':
             output_filename = data
             yield (
                 gr.update(interactive=False),
                 gr.update(interactive=True)
             )
         elif flag == 'progress':
             preview, desc, html = data
             yield (
                 gr.update(interactive=False),
                 gr.update(interactive=True)
             )
         elif flag == 'end':
             yield (
                 output_filename,
 def end_process():
     """Ask the running job to stop by pushing 'end' onto the stream's input queue.

     The worker loop reads the top of ``stream.input_queue`` at the start of
     each section and, when it sees 'end', pushes an ('end', None) message to
     the output queue and returns.
     """
     # NOTE(review): `stream` is a module-global that process() rebinds via
     # `global stream`; this assumes it is already assigned when this runs —
     # TODO confirm a module-level initialization exists.
     stream.input_queue.push('end')
 quick_prompts = [
     'The girl dances gracefully, with clear movements, full of charm.',
     'A character doing some simple body movements.'