Update app_v2v.py

app_v2v.py CHANGED (+4 -36)

@@ -12,15 +12,10 @@ import safetensors.torch as sf
 import numpy as np
 import argparse
 import math
-# 20250506 pftq: Added for video input loading
 import decord
-# 20250506 pftq: Added for progress bars in video_encode
 from tqdm import tqdm
-# 20250506 pftq: Normalize file paths for Windows compatibility
 import pathlib
-# 20250506 pftq: for easier to read timestamp
 from datetime import datetime
-# 20250508 pftq: for saving prompt to mp4 comments metadata
 import imageio_ffmpeg
 import tempfile
 import shutil
@@ -107,7 +102,7 @@ stream = AsyncStream()
 outputs_folder = './outputs/'
 os.makedirs(outputs_folder, exist_ok=True)
 
-
+@spaces.GPU()
 @torch.no_grad()
 def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, device="cuda", width=None, height=None):
     """
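
Note: @spaces.GPU() is the Hugging Face ZeroGPU decorator. On a ZeroGPU Space, a GPU is attached only for the duration of the decorated call and released afterwards, which is why this commit adds it to both video_encode and worker. A minimal sketch of the pattern (the function and tensor below are illustrative, not part of this app):

    import spaces
    import torch

    @spaces.GPU()  # a ZeroGPU device is attached while this call runs, then released
    def square_on_gpu(x: torch.Tensor) -> torch.Tensor:
        return (x.to("cuda") ** 2).cpu()

    print(square_on_gpu(torch.ones(4)))  # runs on the borrowed GPU inside the Space

An optional duration argument, e.g. @spaces.GPU(duration=120), can extend the default allocation window for long-running calls.
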
@@ -126,17 +121,14 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
         history_latents: Latents of all frames (shape: [1, channels, frames, height//8, width//8]).
         fps: Frames per second of the input video.
     """
-    # 20250506 pftq: Normalize video path for Windows compatibility
     video_path = str(pathlib.Path(video_path).resolve())
     print(f"Processing video: {video_path}")
 
-    # 20250506 pftq: Check CUDA availability and fallback to CPU if needed
     if device == "cuda" and not torch.cuda.is_available():
         print("CUDA is not available, falling back to CPU")
         device = "cpu"
 
     try:
-        # 20250506 pftq: Load video and get FPS
         print("Initializing VideoReader...")
         vr = decord.VideoReader(video_path)
         fps = vr.get_avg_fps() # Get input video FPS
@@ -150,27 +142,22 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
         print(f"Truncating video from {num_real_frames} to {num_frames} frames for latent size compatibility")
         num_real_frames = num_frames
 
-        # 20250506 pftq: Read frames
         print("Reading video frames...")
         frames = vr.get_batch(range(num_real_frames)).asnumpy() # Shape: (num_real_frames, height, width, channels)
         print(f"Frames read: {frames.shape}")
 
-        # 20250506 pftq: Get native video resolution
         native_height, native_width = frames.shape[1], frames.shape[2]
         print(f"Native video resolution: {native_width}x{native_height}")
 
-        # 20250506 pftq: Use native resolution if height/width not specified, otherwise use provided values
         target_height = native_height if height is None else height
         target_width = native_width if width is None else width
 
-        # 20250506 pftq: Adjust to nearest bucket for model compatibility
         if not no_resize:
             target_height, target_width = find_nearest_bucket(target_height, target_width, resolution=resolution)
             print(f"Adjusted resolution: {target_width}x{target_height}")
         else:
             print(f"Using native resolution without resizing: {target_width}x{target_height}")
 
-        # 20250506 pftq: Preprocess frames to match original image processing
         processed_frames = []
         for i, frame in enumerate(frames):
             #print(f"Preprocessing frame {i+1}/{num_frames}")
@@ -179,10 +166,8 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
         processed_frames = np.stack(processed_frames) # Shape: (num_real_frames, height, width, channels)
         print(f"Frames preprocessed: {processed_frames.shape}")
 
-        # 20250506 pftq: Save first frame for CLIP vision encoding
         input_image_np = processed_frames[0]
 
-        # 20250506 pftq: Convert to tensor and normalize to [-1, 1]
         print("Converting frames to tensor...")
         frames_pt = torch.from_numpy(processed_frames).float() / 127.5 - 1
         frames_pt = frames_pt.permute(0, 3, 1, 2) # Shape: (num_real_frames, channels, height, width)
@@ -190,20 +175,16 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
         frames_pt = frames_pt.permute(0, 2, 1, 3, 4) # Shape: (1, channels, num_real_frames, height, width)
         print(f"Tensor shape: {frames_pt.shape}")
 
-        # 20250507 pftq: Save pixel frames for use in worker
         input_video_pixels = frames_pt.cpu()
 
-        # 20250506 pftq: Move to device
         print(f"Moving tensor to device: {device}")
         frames_pt = frames_pt.to(device)
         print("Tensor moved to device")
 
-        # 20250506 pftq: Move VAE to device
         print(f"Moving VAE to device: {device}")
         vae.to(device)
         print("VAE moved to device")
 
-        # 20250506 pftq: Encode frames in batches
         print(f"Encoding input video frames in VAE batch size {vae_batch_size} (reduce if memory issues here or if forcing video resolution)")
         latents = []
         vae.eval()
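
For reference, the shape bookkeeping running through these context lines: decord yields uint8 frames as (num_frames, height, width, channels); video_encode scales them to [-1, 1] and rearranges them into the (1, channels, num_frames, height, width) layout the VAE encoder expects. A self-contained sketch with dummy frames (the unsqueeze step is inferred from the shape comments, since that line falls outside the hunks shown):

    import numpy as np
    import torch

    frames = np.zeros((8, 480, 640, 3), dtype=np.uint8)  # (T, H, W, C), as decord returns
    x = torch.from_numpy(frames).float() / 127.5 - 1     # uint8 [0, 255] -> float [-1, 1]
    x = x.permute(0, 3, 1, 2)                            # (T, C, H, W)
    x = x.unsqueeze(0).permute(0, 2, 1, 3, 4)            # (1, C, T, H, W)
    print(x.shape)                                       # torch.Size([1, 3, 8, 480, 640])
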
@@ -212,15 +193,13 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
             #print(f"Encoding batch {i//vae_batch_size + 1}: frames {i} to {min(i + vae_batch_size, frames_pt.shape[2])}")
             batch = frames_pt[:, :, i:i + vae_batch_size] # Shape: (1, channels, batch_size, height, width)
             try:
-                # 20250506 pftq: Log GPU memory before encoding
                 if device == "cuda":
                     free_mem = torch.cuda.memory_allocated() / 1024**3
-
+                    print(f"GPU memory before encoding: {free_mem:.2f} GB")
                 batch_latent = vae_encode(batch, vae)
-                # 20250506 pftq: Synchronize CUDA to catch issues
                 if device == "cuda":
                     torch.cuda.synchronize()
-
+                    print(f"GPU memory after encoding: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
                 latents.append(batch_latent)
                 #print(f"Batch encoded, latent shape: {batch_latent.shape}")
             except RuntimeError as e:
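
The two added print lines above are the commit's only functional change inside video_encode: they log allocated VRAM around each VAE batch (note that, despite its name, free_mem holds allocated memory, not free memory). The same pattern as a reusable helper, a sketch only (the helper name is mine, not from the app):

    import torch

    def log_vram(tag: str) -> None:
        if torch.cuda.is_available():
            torch.cuda.synchronize()  # flush queued kernels so async errors surface here
            allocated = torch.cuda.memory_allocated() / 1024**3  # bytes -> GiB
            print(f"{tag}: {allocated:.2f} GB allocated")

    log_vram("before encode")  # e.g. bracketing a vae_encode(batch, vae) call
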
@@ -229,16 +208,13 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
                 print("CUDA out of memory, try reducing vae_batch_size or using CPU")
                 raise
 
-        # 20250506 pftq: Concatenate latents
         print("Concatenating latents...")
         history_latents = torch.cat(latents, dim=2) # Shape: (1, channels, frames, height//8, width//8)
         print(f"History latents shape: {history_latents.shape}")
 
-        # 20250506 pftq: Get first frame's latent
         start_latent = history_latents[:, :, :1] # Shape: (1, channels, 1, height//8, width//8)
         print(f"Start latent shape: {start_latent.shape}")
 
-        # 20250506 pftq: Move VAE back to CPU to free GPU memory
         if device == "cuda":
             vae.to(cpu)
             torch.cuda.empty_cache()
@@ -250,13 +226,10 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
         print(f"Error in video_encode: {str(e)}")
         raise
 
-# 20250508 pftq: for saving prompt to mp4 metadata comments
 def set_mp4_comments_imageio_ffmpeg(input_file, comments):
     try:
-        # Get the path to the bundled FFmpeg binary from imageio-ffmpeg
         ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
 
-        # Check if input file exists
         if not os.path.exists(input_file):
             print(f"Error: Input file {input_file} does not exist")
             return False
@@ -275,7 +248,6 @@ def set_mp4_comments_imageio_ffmpeg(input_file, comments):
             temp_file # temporary output file
         ]
 
-        # Run the FFmpeg command
         result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
 
         if result.returncode == 0:
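
For context, set_mp4_comments_imageio_ffmpeg (only its inline comments change in this commit) rewrites the MP4 container with a comment metadata tag via the ffmpeg binary bundled with imageio-ffmpeg. A hedged sketch of the core command; the app's exact flag list sits outside the hunks shown, so the flags below are standard-ffmpeg assumptions:

    import subprocess
    import imageio_ffmpeg

    def write_mp4_comment(src: str, dst: str, text: str) -> bool:
        cmd = [
            imageio_ffmpeg.get_ffmpeg_exe(),  # path to the bundled ffmpeg binary
            "-i", src,
            "-metadata", f"comment={text}",   # store the prompt in the comment tag
            "-codec", "copy",                 # copy streams; only container metadata changes
            "-y", dst,
        ]
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        return result.returncode == 0
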
@@ -297,14 +269,13 @@ def set_mp4_comments_imageio_ffmpeg(input_file, comments):
         print(f"Error saving prompt to video metadata, ffmpeg may be required: "+str(e))
         return False
 
-
+@spaces.GPU()
 @torch.no_grad()
 def worker(input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
 
     stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
 
     try:
-        # Clean GPU
         if not high_vram:
             unload_complete_models(
                 text_encoder, text_encoder_2, image_encoder, vae, transformer
@@ -372,10 +343,8 @@ def worker(input_video, prompt, n_prompt, seed, batch, resolution, total_second_
 
         rnd = torch.Generator("cpu").manual_seed(seed)
 
-        # 20250506 pftq: Initialize history_latents with video latents
         history_latents = video_latents.cpu()
         total_generated_latent_frames = history_latents.shape[2]
-        # 20250506 pftq: Initialize history_pixels to fix UnboundLocalError
         history_pixels = None
         previous_video = None
 
@@ -552,7 +521,6 @@ def worker(input_video, prompt, n_prompt, seed, batch, resolution, total_second_
         stream.output_queue.push(('end', None))
         return
 
-# 20250506 pftq: Modified process to pass clean frame count, etc from video_encode
 def process(input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
     global stream, high_vram
     # 20250506 pftq: Updated assertion for video input