Update app_v2v.py

app_v2v.py CHANGED (+4 -36)

@@ -12,15 +12,10 @@ import safetensors.torch as sf
 import numpy as np
 import argparse
 import math
-# 20250506 pftq: Added for video input loading
 import decord
-# 20250506 pftq: Added for progress bars in video_encode
 from tqdm import tqdm
-# 20250506 pftq: Normalize file paths for Windows compatibility
 import pathlib
-# 20250506 pftq: for easier to read timestamp
 from datetime import datetime
-# 20250508 pftq: for saving prompt to mp4 comments metadata
 import imageio_ffmpeg
 import tempfile
 import shutil
@@ -107,7 +102,7 @@ stream = AsyncStream()
 outputs_folder = './outputs/'
 os.makedirs(outputs_folder, exist_ok=True)
 
-
+@spaces.GPU()
 @torch.no_grad()
 def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, device="cuda", width=None, height=None):
     """
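
Note: @spaces.GPU() is the Hugging Face ZeroGPU decorator. On a ZeroGPU Space, a GPU is attached only for the duration of the decorated call and released afterwards, which is why this commit adds it to both video_encode and worker. A minimal sketch of the pattern (the function and tensor below are illustrative, not part of this app):

    import spaces
    import torch

    @spaces.GPU()  # a ZeroGPU device is attached while this call runs, then released
    def square_on_gpu(x: torch.Tensor) -> torch.Tensor:
        return (x.to("cuda") ** 2).cpu()

    print(square_on_gpu(torch.ones(4)))  # runs on the borrowed GPU inside the Space

An optional duration argument, e.g. @spaces.GPU(duration=120), can extend the default allocation window for long-running calls.
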
@@ -126,17 +121,14 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
         history_latents: Latents of all frames (shape: [1, channels, frames, height//8, width//8]).
         fps: Frames per second of the input video.
     """
-    # 20250506 pftq: Normalize video path for Windows compatibility
     video_path = str(pathlib.Path(video_path).resolve())
     print(f"Processing video: {video_path}")
 
-    # 20250506 pftq: Check CUDA availability and fallback to CPU if needed
     if device == "cuda" and not torch.cuda.is_available():
         print("CUDA is not available, falling back to CPU")
         device = "cpu"
 
     try:
-        # 20250506 pftq: Load video and get FPS
         print("Initializing VideoReader...")
         vr = decord.VideoReader(video_path)
         fps = vr.get_avg_fps() # Get input video FPS
@@ -150,27 +142,22 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
         print(f"Truncating video from {num_real_frames} to {num_frames} frames for latent size compatibility")
         num_real_frames = num_frames
 
-        # 20250506 pftq: Read frames
         print("Reading video frames...")
         frames = vr.get_batch(range(num_real_frames)).asnumpy() # Shape: (num_real_frames, height, width, channels)
         print(f"Frames read: {frames.shape}")
 
-        # 20250506 pftq: Get native video resolution
         native_height, native_width = frames.shape[1], frames.shape[2]
         print(f"Native video resolution: {native_width}x{native_height}")
 
-        # 20250506 pftq: Use native resolution if height/width not specified, otherwise use provided values
         target_height = native_height if height is None else height
         target_width = native_width if width is None else width
 
-        # 20250506 pftq: Adjust to nearest bucket for model compatibility
         if not no_resize:
             target_height, target_width = find_nearest_bucket(target_height, target_width, resolution=resolution)
             print(f"Adjusted resolution: {target_width}x{target_height}")
         else:
             print(f"Using native resolution without resizing: {target_width}x{target_height}")
 
-        # 20250506 pftq: Preprocess frames to match original image processing
         processed_frames = []
         for i, frame in enumerate(frames):
             #print(f"Preprocessing frame {i+1}/{num_frames}")
@@ -179,10 +166,8 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
         processed_frames = np.stack(processed_frames) # Shape: (num_real_frames, height, width, channels)
         print(f"Frames preprocessed: {processed_frames.shape}")
 
-        # 20250506 pftq: Save first frame for CLIP vision encoding
         input_image_np = processed_frames[0]
 
-        # 20250506 pftq: Convert to tensor and normalize to [-1, 1]
         print("Converting frames to tensor...")
         frames_pt = torch.from_numpy(processed_frames).float() / 127.5 - 1
         frames_pt = frames_pt.permute(0, 3, 1, 2) # Shape: (num_real_frames, channels, height, width)
@@ -190,20 +175,16 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
         frames_pt = frames_pt.permute(0, 2, 1, 3, 4) # Shape: (1, channels, num_real_frames, height, width)
         print(f"Tensor shape: {frames_pt.shape}")
 
-        # 20250507 pftq: Save pixel frames for use in worker
         input_video_pixels = frames_pt.cpu()
 
-        # 20250506 pftq: Move to device
         print(f"Moving tensor to device: {device}")
         frames_pt = frames_pt.to(device)
         print("Tensor moved to device")
 
-        # 20250506 pftq: Move VAE to device
         print(f"Moving VAE to device: {device}")
         vae.to(device)
         print("VAE moved to device")
 
-        # 20250506 pftq: Encode frames in batches
         print(f"Encoding input video frames in VAE batch size {vae_batch_size} (reduce if memory issues here or if forcing video resolution)")
         latents = []
         vae.eval()
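
For reference, the shape bookkeeping running through these context lines: decord yields uint8 frames as (num_frames, height, width, channels); video_encode scales them to [-1, 1] and rearranges them into the (1, channels, num_frames, height, width) layout the VAE encoder expects. A self-contained sketch with dummy frames (the unsqueeze step is inferred from the shape comments, since that line falls outside the hunks shown):

    import numpy as np
    import torch

    frames = np.zeros((8, 480, 640, 3), dtype=np.uint8)  # (T, H, W, C), as decord returns
    x = torch.from_numpy(frames).float() / 127.5 - 1     # uint8 [0, 255] -> float [-1, 1]
    x = x.permute(0, 3, 1, 2)                            # (T, C, H, W)
    x = x.unsqueeze(0).permute(0, 2, 1, 3, 4)            # (1, C, T, H, W)
    print(x.shape)                                       # torch.Size([1, 3, 8, 480, 640])
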
@@ -212,15 +193,13 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
             #print(f"Encoding batch {i//vae_batch_size + 1}: frames {i} to {min(i + vae_batch_size, frames_pt.shape[2])}")
             batch = frames_pt[:, :, i:i + vae_batch_size] # Shape: (1, channels, batch_size, height, width)
             try:
-                # 20250506 pftq: Log GPU memory before encoding
                 if device == "cuda":
                     free_mem = torch.cuda.memory_allocated() / 1024**3
-
+                    print(f"GPU memory before encoding: {free_mem:.2f} GB")
                 batch_latent = vae_encode(batch, vae)
-                # 20250506 pftq: Synchronize CUDA to catch issues
                 if device == "cuda":
                     torch.cuda.synchronize()
-
+                    print(f"GPU memory after encoding: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
                 latents.append(batch_latent)
                 #print(f"Batch encoded, latent shape: {batch_latent.shape}")
             except RuntimeError as e:
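
The two added print lines above are the commit's only functional change inside video_encode: they log allocated VRAM around each VAE batch (note that, despite its name, free_mem holds allocated memory, not free memory). The same pattern as a reusable helper, a sketch only (the helper name is mine, not from the app):

    import torch

    def log_vram(tag: str) -> None:
        if torch.cuda.is_available():
            torch.cuda.synchronize()  # flush queued kernels so async errors surface here
            allocated = torch.cuda.memory_allocated() / 1024**3  # bytes -> GiB
            print(f"{tag}: {allocated:.2f} GB allocated")

    log_vram("before encode")  # e.g. bracketing a vae_encode(batch, vae) call
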
@@ -229,16 +208,13 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
                 print("CUDA out of memory, try reducing vae_batch_size or using CPU")
                 raise
 
-        # 20250506 pftq: Concatenate latents
         print("Concatenating latents...")
         history_latents = torch.cat(latents, dim=2) # Shape: (1, channels, frames, height//8, width//8)
         print(f"History latents shape: {history_latents.shape}")
 
-        # 20250506 pftq: Get first frame's latent
         start_latent = history_latents[:, :, :1] # Shape: (1, channels, 1, height//8, width//8)
         print(f"Start latent shape: {start_latent.shape}")
 
-        # 20250506 pftq: Move VAE back to CPU to free GPU memory
         if device == "cuda":
             vae.to(cpu)
             torch.cuda.empty_cache()
@@ -250,13 +226,10 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
         print(f"Error in video_encode: {str(e)}")
         raise
 
-# 20250508 pftq: for saving prompt to mp4 metadata comments
 def set_mp4_comments_imageio_ffmpeg(input_file, comments):
     try:
-        # Get the path to the bundled FFmpeg binary from imageio-ffmpeg
         ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
 
-        # Check if input file exists
         if not os.path.exists(input_file):
             print(f"Error: Input file {input_file} does not exist")
             return False
@@ -275,7 +248,6 @@ def set_mp4_comments_imageio_ffmpeg(input_file, comments):
             temp_file # temporary output file
         ]
 
-        # Run the FFmpeg command
         result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
 
         if result.returncode == 0:
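
For context, set_mp4_comments_imageio_ffmpeg (only its inline comments change in this commit) rewrites the MP4 container with a comment metadata tag via the ffmpeg binary bundled with imageio-ffmpeg. A hedged sketch of the core command; the app's exact flag list sits outside the hunks shown, so the flags below are standard-ffmpeg assumptions:

    import subprocess
    import imageio_ffmpeg

    def write_mp4_comment(src: str, dst: str, text: str) -> bool:
        cmd = [
            imageio_ffmpeg.get_ffmpeg_exe(),  # path to the bundled ffmpeg binary
            "-i", src,
            "-metadata", f"comment={text}",   # store the prompt in the comment tag
            "-codec", "copy",                 # copy streams; only container metadata changes
            "-y", dst,
        ]
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        return result.returncode == 0
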
@@ -297,14 +269,13 @@ def set_mp4_comments_imageio_ffmpeg(input_file, comments):
         print(f"Error saving prompt to video metadata, ffmpeg may be required: "+str(e))
         return False
 
-
+@spaces.GPU()
 @torch.no_grad()
 def worker(input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
 
     stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
 
     try:
-        # Clean GPU
         if not high_vram:
             unload_complete_models(
                 text_encoder, text_encoder_2, image_encoder, vae, transformer
@@ -372,10 +343,8 @@ def worker(input_video, prompt, n_prompt, seed, batch, resolution, total_second_
 
         rnd = torch.Generator("cpu").manual_seed(seed)
 
-        # 20250506 pftq: Initialize history_latents with video latents
         history_latents = video_latents.cpu()
         total_generated_latent_frames = history_latents.shape[2]
-        # 20250506 pftq: Initialize history_pixels to fix UnboundLocalError
         history_pixels = None
         previous_video = None
 
@@ -552,7 +521,6 @@ def worker(input_video, prompt, n_prompt, seed, batch, resolution, total_second_
         stream.output_queue.push(('end', None))
         return
 
-# 20250506 pftq: Modified process to pass clean frame count, etc from video_encode
 def process(input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
     global stream, high_vram
     # 20250506 pftq: Updated assertion for video input