Spaces:
Running
on
Zero
Running
on
Zero
MCP server ready
Browse files
webgui.py
CHANGED
@@ -166,6 +166,38 @@ def select_face(det_bboxes, probs):
|
|
166 |
return sorted_bboxes[0]
|
167 |
|
168 |
def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
|
170 |
if seed is not None and seed > -1:
|
171 |
generator = torch.manual_seed(seed)
|
@@ -427,5 +459,5 @@ args = parser.parse_args()
|
|
427 |
# demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
|
428 |
|
429 |
if __name__ == '__main__':
|
430 |
-
demo.queue(max_size=3).launch(show_api=
|
431 |
#demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
|
|
|
166 |
return sorted_bboxes[0]
|
167 |
|
168 |
def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
|
169 |
+
"""
|
170 |
+
Generate a realistic lip-synced talking head video from a static reference image and a voice audio file.
|
171 |
+
|
172 |
+
This function takes an image of a face and an audio clip, then generates a video where the face in the image is animated to match the speech in the audio. It uses EchoMimic's pipeline with configurable parameters for generation quality, length, and face conditioning.
|
173 |
+
|
174 |
+
Args:
|
175 |
+
uploaded_img (str): Path to the input reference image. This should be a front-facing, clear image of a person's face.
|
176 |
+
uploaded_audio (str): Path to the WAV audio file to drive the animation. Speech audio works best.
|
177 |
+
width (int): Target width of the generated video frame.
|
178 |
+
height (int): Target height of the generated video frame.
|
179 |
+
length (int): Number of frames in the final output video.
|
180 |
+
seed (int): Random seed for reproducibility. If -1, a random seed is chosen.
|
181 |
+
facemask_dilation_ratio (float): Dilation ratio for expanding the face mask region.
|
182 |
+
facecrop_dilation_ratio (float): Dilation ratio for cropping the face region from the image.
|
183 |
+
context_frames (int): Number of context frames used in temporal modeling.
|
184 |
+
context_overlap (int): Number of overlapping frames between chunks.
|
185 |
+
cfg (float): Classifier-Free Guidance scale. Higher values make outputs more faithful to input conditions.
|
186 |
+
steps (int): Number of denoising steps in the diffusion process.
|
187 |
+
sample_rate (int): Audio sample rate in Hz (e.g., 16000).
|
188 |
+
fps (int): Frames per second in the output video.
|
189 |
+
device (str): Device to run the computation on ("cuda" or "cpu").
|
190 |
+
progress (gr.Progress, optional): Gradio progress tracker for UI display. NOTE(review): this parameter is not present in the function signature shown above — confirm it is added as a keyword argument with a default (e.g. `progress=gr.Progress()`) so Gradio can inject it, or remove this entry from the docstring.
|
191 |
+
|
192 |
+
Returns:
|
193 |
+
str: File path to the final output video with synchronized audio.
|
194 |
+
|
195 |
+
Notes:
|
196 |
+
- Input image should clearly show a single face, ideally centered and facing forward.
|
197 |
+
- Audio should be speech or vocals; music or noise may produce unpredictable results.
|
198 |
+
- The function trims audio to 5 seconds in shared UI mode to reduce compute time.
|
199 |
+
- This function is designed to work on a GPU-enabled environment for optimal performance.
|
200 |
+
"""
|
201 |
|
202 |
if seed is not None and seed > -1:
|
203 |
generator = torch.manual_seed(seed)
|
|
|
459 |
# demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
|
460 |
|
461 |
if __name__ == '__main__':
|
462 |
+
demo.queue(max_size=3).launch(show_api=True, show_error=True, ssr_mode=False, mcp_server=True)
|
463 |
#demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
|