LPX55 committed on
Commit
d4ee98a
·
verified ·
1 Parent(s): 96fb176

Update app_v2v.py

Browse files
Files changed (1) hide show
  1. app_v2v.py +4 -36
app_v2v.py CHANGED
@@ -12,15 +12,10 @@ import safetensors.torch as sf
12
  import numpy as np
13
  import argparse
14
  import math
15
- # 20250506 pftq: Added for video input loading
16
  import decord
17
- # 20250506 pftq: Added for progress bars in video_encode
18
  from tqdm import tqdm
19
- # 20250506 pftq: Normalize file paths for Windows compatibility
20
  import pathlib
21
- # 20250506 pftq: for easier to read timestamp
22
  from datetime import datetime
23
- # 20250508 pftq: for saving prompt to mp4 comments metadata
24
  import imageio_ffmpeg
25
  import tempfile
26
  import shutil
@@ -107,7 +102,7 @@ stream = AsyncStream()
107
  outputs_folder = './outputs/'
108
  os.makedirs(outputs_folder, exist_ok=True)
109
 
110
- # 20250506 pftq: Added function to encode input video frames into latents
111
  @torch.no_grad()
112
  def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, device="cuda", width=None, height=None):
113
  """
@@ -126,17 +121,14 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
126
  history_latents: Latents of all frames (shape: [1, channels, frames, height//8, width//8]).
127
  fps: Frames per second of the input video.
128
  """
129
- # 20250506 pftq: Normalize video path for Windows compatibility
130
  video_path = str(pathlib.Path(video_path).resolve())
131
  print(f"Processing video: {video_path}")
132
 
133
- # 20250506 pftq: Check CUDA availability and fallback to CPU if needed
134
  if device == "cuda" and not torch.cuda.is_available():
135
  print("CUDA is not available, falling back to CPU")
136
  device = "cpu"
137
 
138
  try:
139
- # 20250506 pftq: Load video and get FPS
140
  print("Initializing VideoReader...")
141
  vr = decord.VideoReader(video_path)
142
  fps = vr.get_avg_fps() # Get input video FPS
@@ -150,27 +142,22 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
150
  print(f"Truncating video from {num_real_frames} to {num_frames} frames for latent size compatibility")
151
  num_real_frames = num_frames
152
 
153
- # 20250506 pftq: Read frames
154
  print("Reading video frames...")
155
  frames = vr.get_batch(range(num_real_frames)).asnumpy() # Shape: (num_real_frames, height, width, channels)
156
  print(f"Frames read: {frames.shape}")
157
 
158
- # 20250506 pftq: Get native video resolution
159
  native_height, native_width = frames.shape[1], frames.shape[2]
160
  print(f"Native video resolution: {native_width}x{native_height}")
161
 
162
- # 20250506 pftq: Use native resolution if height/width not specified, otherwise use provided values
163
  target_height = native_height if height is None else height
164
  target_width = native_width if width is None else width
165
 
166
- # 20250506 pftq: Adjust to nearest bucket for model compatibility
167
  if not no_resize:
168
  target_height, target_width = find_nearest_bucket(target_height, target_width, resolution=resolution)
169
  print(f"Adjusted resolution: {target_width}x{target_height}")
170
  else:
171
  print(f"Using native resolution without resizing: {target_width}x{target_height}")
172
 
173
- # 20250506 pftq: Preprocess frames to match original image processing
174
  processed_frames = []
175
  for i, frame in enumerate(frames):
176
  #print(f"Preprocessing frame {i+1}/{num_frames}")
@@ -179,10 +166,8 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
179
  processed_frames = np.stack(processed_frames) # Shape: (num_real_frames, height, width, channels)
180
  print(f"Frames preprocessed: {processed_frames.shape}")
181
 
182
- # 20250506 pftq: Save first frame for CLIP vision encoding
183
  input_image_np = processed_frames[0]
184
 
185
- # 20250506 pftq: Convert to tensor and normalize to [-1, 1]
186
  print("Converting frames to tensor...")
187
  frames_pt = torch.from_numpy(processed_frames).float() / 127.5 - 1
188
  frames_pt = frames_pt.permute(0, 3, 1, 2) # Shape: (num_real_frames, channels, height, width)
@@ -190,20 +175,16 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
190
  frames_pt = frames_pt.permute(0, 2, 1, 3, 4) # Shape: (1, channels, num_real_frames, height, width)
191
  print(f"Tensor shape: {frames_pt.shape}")
192
 
193
- # 20250507 pftq: Save pixel frames for use in worker
194
  input_video_pixels = frames_pt.cpu()
195
 
196
- # 20250506 pftq: Move to device
197
  print(f"Moving tensor to device: {device}")
198
  frames_pt = frames_pt.to(device)
199
  print("Tensor moved to device")
200
 
201
- # 20250506 pftq: Move VAE to device
202
  print(f"Moving VAE to device: {device}")
203
  vae.to(device)
204
  print("VAE moved to device")
205
 
206
- # 20250506 pftq: Encode frames in batches
207
  print(f"Encoding input video frames in VAE batch size {vae_batch_size} (reduce if memory issues here or if forcing video resolution)")
208
  latents = []
209
  vae.eval()
@@ -212,15 +193,13 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
212
  #print(f"Encoding batch {i//vae_batch_size + 1}: frames {i} to {min(i + vae_batch_size, frames_pt.shape[2])}")
213
  batch = frames_pt[:, :, i:i + vae_batch_size] # Shape: (1, channels, batch_size, height, width)
214
  try:
215
- # 20250506 pftq: Log GPU memory before encoding
216
  if device == "cuda":
217
  free_mem = torch.cuda.memory_allocated() / 1024**3
218
- #print(f"GPU memory before encoding: {free_mem:.2f} GB")
219
  batch_latent = vae_encode(batch, vae)
220
- # 20250506 pftq: Synchronize CUDA to catch issues
221
  if device == "cuda":
222
  torch.cuda.synchronize()
223
- #print(f"GPU memory after encoding: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
224
  latents.append(batch_latent)
225
  #print(f"Batch encoded, latent shape: {batch_latent.shape}")
226
  except RuntimeError as e:
@@ -229,16 +208,13 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
229
  print("CUDA out of memory, try reducing vae_batch_size or using CPU")
230
  raise
231
 
232
- # 20250506 pftq: Concatenate latents
233
  print("Concatenating latents...")
234
  history_latents = torch.cat(latents, dim=2) # Shape: (1, channels, frames, height//8, width//8)
235
  print(f"History latents shape: {history_latents.shape}")
236
 
237
- # 20250506 pftq: Get first frame's latent
238
  start_latent = history_latents[:, :, :1] # Shape: (1, channels, 1, height//8, width//8)
239
  print(f"Start latent shape: {start_latent.shape}")
240
 
241
- # 20250506 pftq: Move VAE back to CPU to free GPU memory
242
  if device == "cuda":
243
  vae.to(cpu)
244
  torch.cuda.empty_cache()
@@ -250,13 +226,10 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
250
  print(f"Error in video_encode: {str(e)}")
251
  raise
252
 
253
- # 20250508 pftq: for saving prompt to mp4 metadata comments
254
  def set_mp4_comments_imageio_ffmpeg(input_file, comments):
255
  try:
256
- # Get the path to the bundled FFmpeg binary from imageio-ffmpeg
257
  ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
258
 
259
- # Check if input file exists
260
  if not os.path.exists(input_file):
261
  print(f"Error: Input file {input_file} does not exist")
262
  return False
@@ -275,7 +248,6 @@ def set_mp4_comments_imageio_ffmpeg(input_file, comments):
275
  temp_file # temporary output file
276
  ]
277
 
278
- # Run the FFmpeg command
279
  result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
280
 
281
  if result.returncode == 0:
@@ -297,14 +269,13 @@ def set_mp4_comments_imageio_ffmpeg(input_file, comments):
297
  print(f"Error saving prompt to video metadata, ffmpeg may be required: "+str(e))
298
  return False
299
 
300
- # 20250506 pftq: Modified worker to accept video input and clean frame count
301
  @torch.no_grad()
302
  def worker(input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
303
 
304
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
305
 
306
  try:
307
- # Clean GPU
308
  if not high_vram:
309
  unload_complete_models(
310
  text_encoder, text_encoder_2, image_encoder, vae, transformer
@@ -372,10 +343,8 @@ def worker(input_video, prompt, n_prompt, seed, batch, resolution, total_second_
372
 
373
  rnd = torch.Generator("cpu").manual_seed(seed)
374
 
375
- # 20250506 pftq: Initialize history_latents with video latents
376
  history_latents = video_latents.cpu()
377
  total_generated_latent_frames = history_latents.shape[2]
378
- # 20250506 pftq: Initialize history_pixels to fix UnboundLocalError
379
  history_pixels = None
380
  previous_video = None
381
 
@@ -552,7 +521,6 @@ def worker(input_video, prompt, n_prompt, seed, batch, resolution, total_second_
552
  stream.output_queue.push(('end', None))
553
  return
554
 
555
- # 20250506 pftq: Modified process to pass clean frame count, etc from video_encode
556
  def process(input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
557
  global stream, high_vram
558
  # 20250506 pftq: Updated assertion for video input
 
12
  import numpy as np
13
  import argparse
14
  import math
 
15
  import decord
 
16
  from tqdm import tqdm
 
17
  import pathlib
 
18
  from datetime import datetime
 
19
  import imageio_ffmpeg
20
  import tempfile
21
  import shutil
 
102
  outputs_folder = './outputs/'
103
  os.makedirs(outputs_folder, exist_ok=True)
104
 
105
+ @spaces.GPU()
106
  @torch.no_grad()
107
  def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, device="cuda", width=None, height=None):
108
  """
 
121
  history_latents: Latents of all frames (shape: [1, channels, frames, height//8, width//8]).
122
  fps: Frames per second of the input video.
123
  """
 
124
  video_path = str(pathlib.Path(video_path).resolve())
125
  print(f"Processing video: {video_path}")
126
 
 
127
  if device == "cuda" and not torch.cuda.is_available():
128
  print("CUDA is not available, falling back to CPU")
129
  device = "cpu"
130
 
131
  try:
 
132
  print("Initializing VideoReader...")
133
  vr = decord.VideoReader(video_path)
134
  fps = vr.get_avg_fps() # Get input video FPS
 
142
  print(f"Truncating video from {num_real_frames} to {num_frames} frames for latent size compatibility")
143
  num_real_frames = num_frames
144
 
 
145
  print("Reading video frames...")
146
  frames = vr.get_batch(range(num_real_frames)).asnumpy() # Shape: (num_real_frames, height, width, channels)
147
  print(f"Frames read: {frames.shape}")
148
 
 
149
  native_height, native_width = frames.shape[1], frames.shape[2]
150
  print(f"Native video resolution: {native_width}x{native_height}")
151
 
 
152
  target_height = native_height if height is None else height
153
  target_width = native_width if width is None else width
154
 
 
155
  if not no_resize:
156
  target_height, target_width = find_nearest_bucket(target_height, target_width, resolution=resolution)
157
  print(f"Adjusted resolution: {target_width}x{target_height}")
158
  else:
159
  print(f"Using native resolution without resizing: {target_width}x{target_height}")
160
 
 
161
  processed_frames = []
162
  for i, frame in enumerate(frames):
163
  #print(f"Preprocessing frame {i+1}/{num_frames}")
 
166
  processed_frames = np.stack(processed_frames) # Shape: (num_real_frames, height, width, channels)
167
  print(f"Frames preprocessed: {processed_frames.shape}")
168
 
 
169
  input_image_np = processed_frames[0]
170
 
 
171
  print("Converting frames to tensor...")
172
  frames_pt = torch.from_numpy(processed_frames).float() / 127.5 - 1
173
  frames_pt = frames_pt.permute(0, 3, 1, 2) # Shape: (num_real_frames, channels, height, width)
 
175
  frames_pt = frames_pt.permute(0, 2, 1, 3, 4) # Shape: (1, channels, num_real_frames, height, width)
176
  print(f"Tensor shape: {frames_pt.shape}")
177
 
 
178
  input_video_pixels = frames_pt.cpu()
179
 
 
180
  print(f"Moving tensor to device: {device}")
181
  frames_pt = frames_pt.to(device)
182
  print("Tensor moved to device")
183
 
 
184
  print(f"Moving VAE to device: {device}")
185
  vae.to(device)
186
  print("VAE moved to device")
187
 
 
188
  print(f"Encoding input video frames in VAE batch size {vae_batch_size} (reduce if memory issues here or if forcing video resolution)")
189
  latents = []
190
  vae.eval()
 
193
  #print(f"Encoding batch {i//vae_batch_size + 1}: frames {i} to {min(i + vae_batch_size, frames_pt.shape[2])}")
194
  batch = frames_pt[:, :, i:i + vae_batch_size] # Shape: (1, channels, batch_size, height, width)
195
  try:
 
196
  if device == "cuda":
197
  free_mem = torch.cuda.memory_allocated() / 1024**3
198
+ print(f"GPU memory before encoding: {free_mem:.2f} GB")
199
  batch_latent = vae_encode(batch, vae)
 
200
  if device == "cuda":
201
  torch.cuda.synchronize()
202
+ print(f"GPU memory after encoding: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
203
  latents.append(batch_latent)
204
  #print(f"Batch encoded, latent shape: {batch_latent.shape}")
205
  except RuntimeError as e:
 
208
  print("CUDA out of memory, try reducing vae_batch_size or using CPU")
209
  raise
210
 
 
211
  print("Concatenating latents...")
212
  history_latents = torch.cat(latents, dim=2) # Shape: (1, channels, frames, height//8, width//8)
213
  print(f"History latents shape: {history_latents.shape}")
214
 
 
215
  start_latent = history_latents[:, :, :1] # Shape: (1, channels, 1, height//8, width//8)
216
  print(f"Start latent shape: {start_latent.shape}")
217
 
 
218
  if device == "cuda":
219
  vae.to(cpu)
220
  torch.cuda.empty_cache()
 
226
  print(f"Error in video_encode: {str(e)}")
227
  raise
228
 
 
229
  def set_mp4_comments_imageio_ffmpeg(input_file, comments):
230
  try:
 
231
  ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
232
 
 
233
  if not os.path.exists(input_file):
234
  print(f"Error: Input file {input_file} does not exist")
235
  return False
 
248
  temp_file # temporary output file
249
  ]
250
 
 
251
  result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
252
 
253
  if result.returncode == 0:
 
269
  print(f"Error saving prompt to video metadata, ffmpeg may be required: "+str(e))
270
  return False
271
 
272
+ @spaces.GPU()
273
  @torch.no_grad()
274
  def worker(input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
275
 
276
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
277
 
278
  try:
 
279
  if not high_vram:
280
  unload_complete_models(
281
  text_encoder, text_encoder_2, image_encoder, vae, transformer
 
343
 
344
  rnd = torch.Generator("cpu").manual_seed(seed)
345
 
 
346
  history_latents = video_latents.cpu()
347
  total_generated_latent_frames = history_latents.shape[2]
 
348
  history_pixels = None
349
  previous_video = None
350
 
 
521
  stream.output_queue.push(('end', None))
522
  return
523
 
 
524
  def process(input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
525
  global stream, high_vram
526
  # 20250506 pftq: Updated assertion for video input