import subprocess
import os
import sys
import shutil
from pathlib import Path
import argparse
import gradio as gr
from STT.sst import speech_to_text
from LLM.llm import generate_reply
from TTS_X.tts import generate_voice
from FantasyTalking.infer import load_models, main
# Download the model weights on first run if they are not already present.
if not os.path.exists("./models/fantasytalking_model.ckpt"):
    subprocess.run(["python", "download_models.py"], check=True)
sys.path.append(os.path.abspath("."))
# Default arguments shared by every generation request; the per-request fields
# (image_path, audio_path, prompt, output_dir) are filled in by generate_video().
args_template = argparse.Namespace(
    fantasytalking_model_path="./models/fantasytalking_model.ckpt",
    wav2vec_model_dir="./models/wav2vec2-base-960h",
    wan_model_dir="./models/Wan2.1-I2V-14B-720P",
    image_path="",
    audio_path="",
    prompt="",
    output_dir="./output",
    image_size=512,
    audio_scale=1.0,
    prompt_cfg_scale=5.0,
    audio_cfg_scale=5.0,
    max_num_frames=81,
    inference_steps=20,
    fps=23,
    num_persistent_param_in_dit=None,
    seed=1111,
)
# Load the heavy models once at startup so they are reused across requests.
pipe, fantasytalking, wav2vec_processor, wav2vec = load_models(args_template)
print("✅ Models loaded")
def generate_video(image_path, audio_path, prompt, output_dir="./output"):
    """Run FantasyTalking inference, overriding the per-request fields of the template args."""
    args_dict = vars(args_template).copy()
    args_dict.update({
        "image_path": image_path,
        "audio_path": audio_path,
        "prompt": prompt,
        "output_dir": output_dir,
    })
    args = argparse.Namespace(**args_dict)
    return main(args, pipe, fantasytalking, wav2vec_processor, wav2vec)
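
# Example usage (the file paths here are hypothetical placeholders):
#     generate_video("./input/face.png", "./input/speech.wav", prompt="a person talking")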
def full_pipeline(user_audio, user_image):
    Path("./output").mkdir(parents=True, exist_ok=True)
    video_path = generate_video(
        image_path=user_image,
        audio_path=user_audio,
        prompt="...",  # you can leave this empty or write anything simple
    )
    # The STT/LLM/TTS stages are bypassed for now: the user's own audio drives
    # the avatar directly, so the transcript and reply fields are returned empty.
    return "", "", user_audio, video_path
with gr.Blocks() as demo:
    gr.Markdown("## Realtime Interactive Avatar 🎭")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Upload Voice", type="filepath")
            image_input = gr.Image(label="Upload Image", type="filepath")
            btn = gr.Button("Generate")
        with gr.Column():
            user_text = gr.Textbox(label="Transcribed Text (Speech to Text)")
            reply_text = gr.Textbox(label="Assistant Response (LLM)")
            reply_audio = gr.Audio(label="Spoken Response (Text to Speech)")
            video_output = gr.Video(label="Final Generated Video")

    btn.click(
        fn=full_pipeline,
        inputs=[audio_input, image_input],
        outputs=[user_text, reply_text, reply_audio, video_output],
    )
demo.launch(inbrowser=True, share=True)
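
# To run locally (assuming the repo's dependencies are installed):
#     python app.py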