Spaces:
Build error
Build error
import os | |
import sys | |
import gradio as gr | |
import torch | |
import tempfile | |
from pathlib import Path | |
import importlib.util | |
import shutil | |
from huggingface_hub import snapshot_download, hf_hub_download | |
import requests | |
# 下载必要的模型代码 | |
def download_amphion_code(timeout=30):
    """Materialise the Amphion source files needed for Vevo under the CWD.

    Package ``__init__.py`` files are created locally and never downloaded.
    Every other entry is fetched from the Amphion ``main`` branch on GitHub
    and written under ``os.getcwd()`` at the same relative path.

    The routine is best-effort: a failed download is reported on stdout and,
    unless a file already exists at that path, a placeholder is written so the
    path at least exists.

    Args:
        timeout: Seconds to wait on each HTTP request before treating it as a
            failure. Without a timeout a stalled connection would block
            startup indefinitely.
    """
    base_url = "https://raw.githubusercontent.com/open-mmlab/Amphion/main/"
    required_files = [
        # Package skeleton
        "models/__init__.py",
        "models/base/__init__.py",
        "models/codec/__init__.py",
        "models/codec/kmeans/__init__.py",
        "models/codec/vevo/__init__.py",
        "models/codec/melvqgan/__init__.py",
        "models/codec/amphion_codec/__init__.py",
        "models/vc/__init__.py",
        "models/vc/flow_matching_transformer/__init__.py",
        "models/vc/autoregressive_transformer/__init__.py",
        "models/tts/__init__.py",
        "models/tts/maskgct/__init__.py",
        "models/tts/maskgct/g2p/__init__.py",
        "utils/__init__.py",
        # Core files
        "models/vc/vevo/vevo_utils.py",
        "models/vc/flow_matching_transformer/fmt_model.py",
        "models/vc/autoregressive_transformer/ar_model.py",
        "models/codec/kmeans/repcodec_model.py",
        "models/codec/vevo/vevo_repcodec.py",
        "models/codec/melvqgan/melspec.py",
        "models/codec/amphion_codec/vocos.py",
        "utils/util.py",
        "models/tts/maskgct/g2p/g2p_generation.py",
        "models/vc/vevo/config/Vq32ToVq8192.json",
        "models/vc/vevo/config/Vq8192ToMels.json",
        "models/vc/vevo/config/PhoneToVq8192.json",
        "models/vc/vevo/config/Vocoder.json",
    ]

    def write_placeholder(path):
        """Create a stub at *path* unless something is already there."""
        if os.path.exists(path):
            return
        # JSON has no comment syntax, so a "# ..." stub would be unparseable;
        # an empty object keeps the file loadable.
        stub = "{}" if path.endswith(".json") else "# Placeholder file\n"
        with open(path, "w") as f:
            f.write(stub)

    for file_path in required_files:
        local_path = os.path.join(os.getcwd(), file_path)
        os.makedirs(os.path.dirname(local_path), exist_ok=True)

        # __init__.py files are generated locally instead of being downloaded.
        if file_path.endswith("__init__.py"):
            if not os.path.exists(local_path):
                with open(local_path, "w") as f:
                    f.write("# Auto-generated file\n")
            continue

        # Everything else is downloaded; any failure falls back to a stub.
        try:
            response = requests.get(base_url + file_path, timeout=timeout)
            if response.status_code == 200:
                with open(local_path, "wb") as f:
                    f.write(response.content)
                print(f"成功下载: {file_path}")
            else:
                print(f"无法下载 {file_path}, 状态码: {response.status_code}")
                write_placeholder(local_path)
        except (requests.RequestException, OSError) as e:
            print(f"下载 {file_path} 时出错: {str(e)}")
            write_placeholder(local_path)
# Fetch the required source files first so the package import below can work.
download_amphion_code()

# Put the current working directory at the front of the import path.
sys.path.insert(0, os.getcwd())

# Prefer the real implementation; if it cannot be imported, define minimal
# stand-ins with the same names so the rest of the module still loads.
try:
    from models.vc.vevo.vevo_utils import VevoInferencePipeline, save_audio
except ImportError as import_err:
    print(f"导入错误: {str(import_err)}")

    class VevoInferencePipeline:
        """Stand-in pipeline used only when the real one fails to import."""

        def __init__(self, **options):
            # Only "device" is read; every other keyword is ignored.
            self.device = options.get("device", "cpu")
            print("警告: 使用VevoInferencePipeline占位符!")

        def inference_ar_and_fm(self, **options):
            """Ignore all arguments and return random noise of shape (1, 24000)."""
            return torch.randn(1, 24000)

        def inference_fm(self, **options):
            """Ignore all arguments and return random noise of shape (1, 24000)."""
            return torch.randn(1, 24000)

    def save_audio(waveform, sr=24000, output_path=None, **kwargs):
        """Write *waveform* to *output_path* via torchaudio and return the path.

        Nothing is written when *output_path* is falsy; the value is returned
        unchanged in that case.
        """
        if not output_path:
            return output_path
        # Imported lazily so torchaudio is only required when actually saving.
        import torchaudio

        torchaudio.save(output_path, waveform, sr)
        return output_path


# Model configuration constants
REPO_ID = "amphion/Vevo"
CACHE_DIR = "./ckpts/Vevo"
class VevoGradioApp:
    """Backend for the Vevo Gradio demo.

    Holds the torch device, the paths of the model config JSON files and a
    cache of inference pipelines. Pipelines are built on first use by the
    ``init_*`` methods and reused afterwards; the ``vevo_*`` methods run one
    inference each and return the path of the audio file they wrote.
    """

    # Directory (relative to the working directory) holding the config JSONs.
    CONFIG_DIR = "./models/vc/vevo/config"
    # Remote directory the config JSONs are fetched from when missing locally.
    CONFIG_BASE_URL = (
        "https://raw.githubusercontent.com/open-mmlab/Amphion/main/"
        "models/vc/vevo/config/"
    )
    # Config files that must exist before any pipeline is constructed.
    CONFIG_FILES = (
        "Vq32ToVq8192.json",
        "Vq8192ToMels.json",
        "PhoneToVq8192.json",
        "Vocoder.json",
    )

    def __init__(self):
        """Select the device, record config paths and make sure configs exist."""
        # Device selection: CUDA when available, otherwise CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Cache of built pipelines keyed by "voice" / "timbre" / "tts".
        self.pipelines = {}
        # Config file paths
        self.config_paths = {
            "vq32tovq8192": f"{self.CONFIG_DIR}/Vq32ToVq8192.json",
            "vq8192tomels": f"{self.CONFIG_DIR}/Vq8192ToMels.json",
            "phonetovq8192": f"{self.CONFIG_DIR}/PhoneToVq8192.json",
            "vocoder": f"{self.CONFIG_DIR}/Vocoder.json",
        }
        # Make sure the config files exist.
        self.download_configs()

    def download_configs(self, timeout=30):
        """Download any missing config file; write ``{}`` if the download fails.

        Files that already exist are left untouched. Failures are reported on
        stdout and never raised for network or filesystem errors during the
        download itself.

        Args:
            timeout: Seconds to wait on each HTTP request before giving up.
        """
        os.makedirs(self.CONFIG_DIR, exist_ok=True)
        for filename in self.CONFIG_FILES:
            target_path = f"{self.CONFIG_DIR}/{filename}"
            if os.path.exists(target_path):
                continue
            try:
                fetched = self._download_config(filename, target_path, timeout)
            except (requests.RequestException, OSError):
                # Only network/filesystem failures are treated as "could not
                # download"; KeyboardInterrupt and friends still propagate.
                fetched = False
            if not fetched:
                # Fall back to an empty JSON object so the path exists.
                with open(target_path, "w") as f:
                    f.write("{}")
                print(f"无法下载配置文件 {filename},已创建占位符")

    def _download_config(self, filename, target_path, timeout):
        """Fetch one config file; return True only if it was saved (HTTP 200)."""
        response = requests.get(self.CONFIG_BASE_URL + filename, timeout=timeout)
        if response.status_code != 200:
            return False
        with open(target_path, "wb") as f:
            f.write(response.content)
        print(f"成功下载配置文件: {filename}")
        return True

    def _fetch_checkpoint(self, subdir, filename=None):
        """Download ``subdir/*`` from the model repo and return its local path.

        Args:
            subdir: Repo-relative directory to download, e.g. "tokenizer/vq8192".
            filename: Optional file inside *subdir*; when given, the returned
                path points at that file instead of the directory.
        """
        local_dir = snapshot_download(
            repo_id=REPO_ID,
            repo_type="model",
            cache_dir=CACHE_DIR,
            allow_patterns=[f"{subdir}/*"],
        )
        relative = subdir if filename is None else f"{subdir}/{filename}"
        return os.path.join(local_dir, relative)

    @staticmethod
    def _new_output_path():
        """Create an empty ``.wav`` temp file and return its path.

        ``delete=False`` keeps the file on disk after the handle is closed so
        it can be written to and then returned to the caller.
        """
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
            return output_file.name

    def init_voice_conversion_pipeline(self):
        """Return the cached "voice" pipeline, building it on first use.

        If anything fails while downloading checkpoints or constructing the
        pipeline, the error is printed and a pipeline built with only
        ``device`` is cached instead.
        """
        if "voice" not in self.pipelines:
            try:
                # Content tokenizer
                content_tokenizer_ckpt_path = self._fetch_checkpoint(
                    "tokenizer/vq32", "hubert_large_l18_c32.pkl"
                )
                # Content-style tokenizer
                content_style_tokenizer_ckpt_path = self._fetch_checkpoint(
                    "tokenizer/vq8192"
                )
                # Autoregressive transformer
                ar_ckpt_path = self._fetch_checkpoint(
                    "contentstyle_modeling/Vq32ToVq8192"
                )
                # Flow-matching transformer
                fmt_ckpt_path = self._fetch_checkpoint("acoustic_modeling/Vq8192ToMels")
                # Vocoder
                vocoder_ckpt_path = self._fetch_checkpoint("acoustic_modeling/Vocoder")
                # Build the inference pipeline.
                self.pipelines["voice"] = VevoInferencePipeline(
                    content_tokenizer_ckpt_path=content_tokenizer_ckpt_path,
                    content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
                    ar_cfg_path=self.config_paths["vq32tovq8192"],
                    ar_ckpt_path=ar_ckpt_path,
                    fmt_cfg_path=self.config_paths["vq8192tomels"],
                    fmt_ckpt_path=fmt_ckpt_path,
                    vocoder_cfg_path=self.config_paths["vocoder"],
                    vocoder_ckpt_path=vocoder_ckpt_path,
                    device=self.device,
                )
            except Exception as e:
                print(f"初始化语音转换管道时出错: {e}")
                # NOTE(review): this device-only construction works with the
                # import-failure stub; confirm the real class accepts it.
                self.pipelines["voice"] = VevoInferencePipeline(device=self.device)
        return self.pipelines["voice"]

    def init_timbre_pipeline(self):
        """Return the cached "timbre" pipeline, building it on first use.

        If anything fails while downloading checkpoints or constructing the
        pipeline, the error is printed and a pipeline built with only
        ``device`` is cached instead.
        """
        if "timbre" not in self.pipelines:
            try:
                # Content-style tokenizer
                tokenizer_ckpt_path = self._fetch_checkpoint("tokenizer/vq8192")
                # Flow-matching transformer
                fmt_ckpt_path = self._fetch_checkpoint("acoustic_modeling/Vq8192ToMels")
                # Vocoder
                vocoder_ckpt_path = self._fetch_checkpoint("acoustic_modeling/Vocoder")
                # Build the inference pipeline.
                self.pipelines["timbre"] = VevoInferencePipeline(
                    content_style_tokenizer_ckpt_path=tokenizer_ckpt_path,
                    fmt_cfg_path=self.config_paths["vq8192tomels"],
                    fmt_ckpt_path=fmt_ckpt_path,
                    vocoder_cfg_path=self.config_paths["vocoder"],
                    vocoder_ckpt_path=vocoder_ckpt_path,
                    device=self.device,
                )
            except Exception as e:
                print(f"初始化音色转换管道时出错: {e}")
                # NOTE(review): device-only fallback; see the voice pipeline.
                self.pipelines["timbre"] = VevoInferencePipeline(device=self.device)
        return self.pipelines["timbre"]

    def init_tts_pipeline(self):
        """Return the cached "tts" pipeline, building it on first use.

        If anything fails while downloading checkpoints or constructing the
        pipeline, the error is printed and a pipeline built with only
        ``device`` is cached instead.
        """
        if "tts" not in self.pipelines:
            try:
                # Content-style tokenizer
                content_style_tokenizer_ckpt_path = self._fetch_checkpoint(
                    "tokenizer/vq8192"
                )
                # Autoregressive transformer
                ar_ckpt_path = self._fetch_checkpoint(
                    "contentstyle_modeling/PhoneToVq8192"
                )
                # Flow-matching transformer
                fmt_ckpt_path = self._fetch_checkpoint("acoustic_modeling/Vq8192ToMels")
                # Vocoder
                vocoder_ckpt_path = self._fetch_checkpoint("acoustic_modeling/Vocoder")
                # Build the inference pipeline.
                self.pipelines["tts"] = VevoInferencePipeline(
                    content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
                    ar_cfg_path=self.config_paths["phonetovq8192"],
                    ar_ckpt_path=ar_ckpt_path,
                    fmt_cfg_path=self.config_paths["vq8192tomels"],
                    fmt_ckpt_path=fmt_ckpt_path,
                    vocoder_cfg_path=self.config_paths["vocoder"],
                    vocoder_ckpt_path=vocoder_ckpt_path,
                    device=self.device,
                )
            except Exception as e:
                print(f"初始化TTS管道时出错: {e}")
                # NOTE(review): device-only fallback; see the voice pipeline.
                self.pipelines["tts"] = VevoInferencePipeline(device=self.device)
        return self.pipelines["tts"]

    def vevo_voice(self, content_audio, reference_audio):
        """Voice conversion: *reference_audio* is both style and timbre reference.

        Args:
            content_audio: Path of the source audio file.
            reference_audio: Path of the reference audio file.

        Returns:
            Path of the temporary ``.wav`` file handed to ``save_audio``.
        """
        pipeline = self.init_voice_conversion_pipeline()
        output_path = self._new_output_path()
        # Run the conversion; audio inputs are passed through as file paths.
        gen_audio = pipeline.inference_ar_and_fm(
            src_wav_path=content_audio,
            src_text=None,
            style_ref_wav_path=reference_audio,
            timbre_ref_wav_path=reference_audio,
        )
        save_audio(gen_audio, output_path=output_path)
        return output_path

    def vevo_style(self, content_audio, style_audio):
        """Style conversion: style from *style_audio*, timbre from *content_audio*.

        Args:
            content_audio: Path of the source audio file; also passed as the
                timbre reference.
            style_audio: Path of the style reference audio file.

        Returns:
            Path of the temporary ``.wav`` file handed to ``save_audio``.
        """
        pipeline = self.init_voice_conversion_pipeline()
        output_path = self._new_output_path()
        # Run the conversion; audio inputs are passed through as file paths.
        gen_audio = pipeline.inference_ar_and_fm(
            src_wav_path=content_audio,
            src_text=None,
            style_ref_wav_path=style_audio,
            timbre_ref_wav_path=content_audio,
        )
        save_audio(gen_audio, output_path=output_path)
        return output_path

    def vevo_timbre(self, content_audio, reference_audio):
        """Timbre conversion using the flow-matching stage only (32 steps).

        Args:
            content_audio: Path of the source audio file.
            reference_audio: Path of the timbre reference audio file.

        Returns:
            Path of the temporary ``.wav`` file handed to ``save_audio``.
        """
        pipeline = self.init_timbre_pipeline()
        output_path = self._new_output_path()
        # Run the conversion; audio inputs are passed through as file paths.
        gen_audio = pipeline.inference_fm(
            src_wav_path=content_audio,
            timbre_ref_wav_path=reference_audio,
            flow_matching_steps=32,
        )
        save_audio(gen_audio, output_path=output_path)
        return output_path

    def vevo_tts(self, text, ref_audio, src_language, ref_language, ref_text):
        """Text-to-speech with *ref_audio* as both style and timbre reference.

        Args:
            text: Text to synthesise.
            ref_audio: Path of the reference audio file.
            src_language: Language code passed as ``src_text_language``.
            ref_language: Language code passed as
                ``style_ref_wav_text_language``.
            ref_text: Optional transcript of the reference audio; an empty
                value is forwarded as ``None``.

        Returns:
            Path of the temporary ``.wav`` file handed to ``save_audio``.
        """
        pipeline = self.init_tts_pipeline()
        output_path = self._new_output_path()
        # Run synthesis; the reference audio is passed through as a file path.
        gen_audio = pipeline.inference_ar_and_fm(
            src_wav_path=None,
            src_text=text,
            style_ref_wav_path=ref_audio,
            timbre_ref_wav_path=ref_audio,
            style_ref_wav_text=ref_text if ref_text else None,
            src_text_language=src_language,
            style_ref_wav_text_language=ref_language,
        )
        save_audio(gen_audio, output_path=output_path)
        return output_path
def create_interface():
    """Assemble the Gradio Blocks UI for the Vevo demo and return it unlaunched.

    One ``VevoGradioApp`` is created and its ``vevo_*`` methods are wired to
    the buttons of four tabs: voice, style and timbre conversion, and
    text-to-speech.
    """
    app = VevoGradioApp()

    def add_conversion_tab(tab_title, heading, blurb, second_label, handler):
        """Add a tab with two audio inputs, a button and one audio output."""
        with gr.Tab(tab_title):
            gr.Markdown(heading)
            gr.Markdown(blurb)
            with gr.Row():
                # type="filepath" makes Gradio hand file paths to the handler.
                source_audio = gr.Audio(label="内容音频", type="filepath")
                guide_audio = gr.Audio(label=second_label, type="filepath")
            run_btn = gr.Button("转换")
            result_audio = gr.Audio(label="转换结果")
            run_btn.click(
                fn=handler,
                inputs=[source_audio, guide_audio],
                outputs=result_audio,
            )

    with gr.Blocks(title="Vevo 语音转换演示") as demo:
        gr.Markdown("# Vevo 语音转换模型演示")
        gr.Markdown("Vevo是一个强大的语音转换模型,支持语音转换、风格转换、音色转换和文本转语音功能。")

        add_conversion_tab(
            "语音转换",
            "## 语音转换 (VevoVoice)",
            "将内容音频的内容转换为参考音频的风格和音色。",
            "参考音频",
            app.vevo_voice,
        )
        add_conversion_tab(
            "风格转换",
            "## 风格转换 (VevoStyle)",
            "将内容音频的风格转换为参考音频的风格,保留原始音色。",
            "风格参考音频",
            app.vevo_style,
        )
        add_conversion_tab(
            "音色转换",
            "## 音色转换 (VevoTimbre)",
            "将内容音频的音色转换为参考音频的音色,保留内容和风格。",
            "音色参考音频",
            app.vevo_timbre,
        )

        with gr.Tab("文本转语音"):
            gr.Markdown("## 文本转语音 (VevoTTS)")
            gr.Markdown("将输入文本转换为语音,使用参考音频的风格和音色。")
            text_input = gr.Textbox(label="输入文本", lines=3)
            language_codes = ["en", "zh", "ja", "ko"]
            with gr.Row():
                ref_audio_tts = gr.Audio(label="参考音频", type="filepath")
                src_language = gr.Dropdown(
                    list(language_codes), label="源文本语言", value="en"
                )
            with gr.Row():
                ref_language = gr.Dropdown(
                    list(language_codes), label="参考文本语言", value="en"
                )
                ref_text = gr.Textbox(label="参考文本(可选)", lines=2)
            tts_btn = gr.Button("生成")
            tts_output = gr.Audio(label="生成结果")
            # Input order must match the parameter order of app.vevo_tts.
            tts_inputs = [text_input, ref_audio_tts, src_language, ref_language, ref_text]
            tts_btn.click(fn=app.vevo_tts, inputs=tts_inputs, outputs=tts_output)

        # NOTE(review): the About section is placed after the tabs at Blocks
        # level — confirm against the intended layout.
        gr.Markdown("## 关于")
        gr.Markdown("本演示基于 [Vevo模型](https://huggingface.co/amphion/Vevo),由[Amphion](https://github.com/open-mmlab/Amphion)开发。")

    return demo
if __name__ == "__main__":
    # Script entry point: build the Blocks UI, then start the Gradio server.
    demo = create_interface()
    demo.launch()