fffiloni commited on
Commit
8a37a7b
·
verified ·
1 Parent(s): b4b4183

MCP server ready

Browse files
Files changed (1) hide show
  1. webgui.py +33 -1
webgui.py CHANGED
@@ -166,6 +166,38 @@ def select_face(det_bboxes, probs):
166
  return sorted_bboxes[0]
167
 
168
  def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  if seed is not None and seed > -1:
171
  generator = torch.manual_seed(seed)
@@ -427,5 +459,5 @@ args = parser.parse_args()
427
  # demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
428
 
429
  if __name__ == '__main__':
430
- demo.queue(max_size=3).launch(show_api=False, show_error=True, ssr_mode=False)
431
  #demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
 
166
  return sorted_bboxes[0]
167
 
168
  def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
169
+ """
170
+ Generate a realistic lip-synced talking head video from a static reference image and a voice audio file.
171
+
172
+ This function takes an image of a face and an audio clip, then generates a video where the face in the image is animated to match the speech in the audio. It uses EchoMimic's pipeline with configurable parameters for generation quality, length, and face conditioning.
173
+
174
+ Args:
175
+ uploaded_img (str): Path to the input reference image. This should be a front-facing, clear image of a person's face.
176
+ uploaded_audio (str): Path to the WAV audio file to drive the animation. Speech audio works best.
177
+ width (int): Target width of the generated video frame.
178
+ height (int): Target height of the generated video frame.
179
+ length (int): Number of frames in the final output video.
180
+ seed (int): Random seed for reproducibility. If -1, a random seed is chosen.
181
+ facemask_dilation_ratio (float): Dilation ratio for expanding the face mask region.
182
+ facecrop_dilation_ratio (float): Dilation ratio for cropping the face region from the image.
183
+ context_frames (int): Number of context frames used in temporal modeling.
184
+ context_overlap (int): Number of overlapping frames between chunks.
185
+ cfg (float): Classifier-Free Guidance scale. Higher values make outputs more faithful to input conditions.
186
+ steps (int): Number of denoising steps in the diffusion process.
187
+ sample_rate (int): Audio sample rate in Hz (e.g., 16000).
188
+ fps (int): Frames per second in the output video.
189
+ device (str): Device to run the computation on ("cuda" or "cpu").
190
 + progress (gr.Progress): Gradio progress tracker for UI display. (NOTE: no `progress` parameter appears in the function signature above — either add the parameter or drop this docstring entry.)
191
+
192
+ Returns:
193
+ str: File path to the final output video with synchronized audio.
194
+
195
+ Notes:
196
+ - Input image should clearly show a single face, ideally centered and facing forward.
197
+ - Audio should be speech or vocals; music or noise may produce unpredictable results.
198
+ - The function trims audio to 5 seconds in shared UI mode to reduce compute time.
199
+ - This function is designed to work on a GPU-enabled environment for optimal performance.
200
+ """
201
 
202
  if seed is not None and seed > -1:
203
  generator = torch.manual_seed(seed)
 
459
  # demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
460
 
461
  if __name__ == '__main__':
462
+ demo.queue(max_size=3).launch(show_api=True, show_error=True, ssr_mode=False, mcp_server=True)
463
  #demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)