fffiloni commited on
Commit
8a37a7b
·
verified ·
1 Parent(s): b4b4183

MCP server ready

Browse files
Files changed (1) hide show
  1. webgui.py +33 -1
webgui.py CHANGED
@@ -166,6 +166,38 @@ def select_face(det_bboxes, probs):
166
  return sorted_bboxes[0]
167
 
168
  def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  if seed is not None and seed > -1:
171
  generator = torch.manual_seed(seed)
@@ -427,5 +459,5 @@ args = parser.parse_args()
427
  # demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
428
 
429
  if __name__ == '__main__':
430
- demo.queue(max_size=3).launch(show_api=False, show_error=True, ssr_mode=False)
431
  #demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
 
166
  return sorted_bboxes[0]
167
 
168
  def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
169
+ """
170
+ Generate a realistic lip-synced talking head video from a static reference image and a voice audio file.
171
+
172
+ This function takes an image of a face and an audio clip, then generates a video where the face in the image is animated to match the speech in the audio. It uses EchoMimic's pipeline with configurable parameters for generation quality, length, and face conditioning.
173
+
174
+ Args:
175
+ uploaded_img (str): Path to the input reference image. This should be a front-facing, clear image of a person's face.
176
+ uploaded_audio (str): Path to the WAV audio file to drive the animation. Speech audio works best.
177
+ width (int): Target width of the generated video frame.
178
+ height (int): Target height of the generated video frame.
179
+ length (int): Number of frames in the final output video.
180
+ seed (int): Random seed for reproducibility. If -1, a random seed is chosen.
181
+ facemask_dilation_ratio (float): Dilation ratio for expanding the face mask region.
182
+ facecrop_dilation_ratio (float): Dilation ratio for cropping the face region from the image.
183
+ context_frames (int): Number of context frames used in temporal modeling.
184
+ context_overlap (int): Number of overlapping frames between chunks.
185
+ cfg (float): Classifier-Free Guidance scale. Higher values make outputs more faithful to input conditions.
186
+ steps (int): Number of denoising steps in the diffusion process.
187
+ sample_rate (int): Audio sample rate in Hz (e.g., 16000).
188
+ fps (int): Frames per second in the output video.
189
+ device (str): Device to run the computation on ("cuda" or "cpu").
190
 + progress (gr.Progress): Gradio progress tracker for UI display. (NOTE: no `progress` parameter appears in the function signature above — either add the parameter or drop this docstring entry.)
191
+
192
+ Returns:
193
+ str: File path to the final output video with synchronized audio.
194
+
195
+ Notes:
196
+ - Input image should clearly show a single face, ideally centered and facing forward.
197
+ - Audio should be speech or vocals; music or noise may produce unpredictable results.
198
+ - The function trims audio to 5 seconds in shared UI mode to reduce compute time.
199
+ - This function is designed to work on a GPU-enabled environment for optimal performance.
200
+ """
201
 
202
  if seed is not None and seed > -1:
203
  generator = torch.manual_seed(seed)
 
459
  # demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
460
 
461
  if __name__ == '__main__':
462
+ demo.queue(max_size=3).launch(show_api=True, show_error=True, ssr_mode=False, mcp_server=True)
463
  #demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)