File size: 2,617 Bytes
7cedd07
e75e1c6
5223b6a
fa13218
97e8796
 
5223b6a
bad5ae3
 
 
 
7cedd07
fa13218
 
bad5ae3
da8f7f9
 
7cedd07
1ed541f
638c139
bad5ae3
97e8796
da8f7f9
 
fa13218
97e8796
 
 
 
fa13218
97e8796
fa13218
 
 
 
 
97e8796
 
3402d0b
568d66f
bad5ae3
97e8796
bad5ae3
 
97e8796
 
93d986f
 
 
 
 
 
 
 
 
fa13218
3402d0b
bad5ae3
3402d0b
97e8796
4aaf630
3402d0b
 
4aaf630
 
3402d0b
 
4aaf630
 
3402d0b
bad5ae3
 
2c73edd
e682a2e
3402d0b
 
bad5ae3
 
 
e682a2e
3402d0b
bad5ae3
 
 
 
e682a2e
97e8796
 
3402d0b
9dd7b34
3402d0b
bad5ae3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import subprocess
import os
import sys
import shutil
from pathlib import Path
import argparse
import gradio as gr
from STT.sst import speech_to_text
from LLM.llm import generate_reply
from TTS_X.tts import generate_voice
from FantasyTalking.infer import load_models, main



# Download model weights on first run if they are not present locally.
if not os.path.exists("./models/fantasytalking_model.ckpt"):
    # Use the interpreter running this script (sys.executable) rather than
    # whatever "python" happens to be on PATH, so the download runs in the
    # same environment/venv as the app itself.
    subprocess.run([sys.executable, "download_models.py"])


sys.path.append(os.path.abspath("."))

# Default inference arguments shared by every request; the per-request
# fields (image_path / audio_path / prompt / output_dir) are filled in
# later by generate_video().
args_template = argparse.Namespace(
    fantasytalking_model_path="./models/fantasytalking_model.ckpt",
    wav2vec_model_dir="./models/wav2vec2-base-960h",
    wan_model_dir="./models/Wan2.1-I2V-14B-720P",
    image_path="",
    audio_path="",
    prompt="",
    output_dir="./output",
    image_size=512,
    audio_scale=1.0,
    prompt_cfg_scale=5.0,
    audio_cfg_scale=5.0,
    max_num_frames=81,
    inference_steps=20,
    fps=23,
    num_persistent_param_in_dit=None,
    seed=1111
)


# Load the heavy models once at startup so each request only runs inference.
pipe, fantasytalking, wav2vec_processor, wav2vec = load_models(args_template)
print("✅")


def generate_video(image_path, audio_path, prompt, output_dir="./output"):
    """Run FantasyTalking inference for one image/audio pair.

    Clones the module-level argument template, overrides the per-request
    fields, and hands the resulting namespace to the preloaded pipeline.
    Returns whatever `main` returns (the rendered video path).
    """
    request_args = argparse.Namespace(**vars(args_template))
    request_args.image_path = image_path
    request_args.audio_path = audio_path
    request_args.prompt = prompt
    request_args.output_dir = output_dir

    return main(request_args, pipe, fantasytalking, wav2vec_processor, wav2vec)


def full_pipeline(user_audio, user_image):
    """Produce a talking-avatar video from an uploaded voice clip and image.

    Returns a 4-tuple matching the Gradio outputs: empty placeholders for
    the transcription and LLM-reply textboxes, the input audio echoed back,
    and the path of the rendered video.
    """
    Path("./output").mkdir(parents=True, exist_ok=True)

    rendered_video = generate_video(
        image_path=user_image,
        audio_path=user_audio,
        prompt="..."  # you can leave this empty or write anything simple
    )

    return "", "", user_audio, rendered_video



# Build and launch the Gradio UI: inputs on the left, pipeline outputs on
# the right, one button wiring full_pipeline between them.
with gr.Blocks() as demo:
    gr.Markdown("## Realtime Interactive Avatar 🎭")

    with gr.Row():
        # Left column: user-supplied inputs.
        with gr.Column():
            voice_upload = gr.Audio(label="Upload Voice", type="filepath")
            portrait_upload = gr.Image(label="Upload Image", type="filepath")
            generate_btn = gr.Button("Generate")

        # Right column: pipeline results.
        with gr.Column():
            transcript_box = gr.Textbox(label="Transcribed Text (Speech to Text)")
            llm_reply_box = gr.Textbox(label="Assistant Response (LLM)")
            tts_audio_out = gr.Audio(label="Spoken Response (Text to Speech)")
            final_video = gr.Video(label="Final Generated Video")

    generate_btn.click(
        fn=full_pipeline,
        inputs=[voice_upload, portrait_upload],
        outputs=[transcript_box, llm_reply_box, tts_audio_out, final_video],
    )

demo.launch(inbrowser=True, share=True)