File size: 9,239 Bytes
5b4c852
 
 
 
 
 
 
 
 
 
 
 
 
41b44b0
9060d5c
 
e7933a9
1cf3136
 
c9dee22
e7933a9
1cf3136
3c02e07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b4c852
 
 
 
 
 
 
371e3a0
5b4c852
 
 
 
 
cb71acf
 
dc5eeb8
b1f350e
dc5eeb8
5b4c852
 
 
 
 
4e4b6f0
 
da0a385
5b4c852
9060d5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b4c852
258fb54
 
 
 
 
 
 
 
 
9060d5c
 
 
 
 
 
b1f350e
9060d5c
 
 
 
 
 
5b4c852
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
667efb8
5b4c852
9060d5c
5b4c852
 
 
 
 
 
371e3a0
4e4b6f0
 
5b4c852
 
 
 
 
 
 
4e4b6f0
 
 
 
5b4c852
4e4b6f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b4c852
 
 
4e4b6f0
 
5b4c852
4e4b6f0
 
 
 
 
5b4c852
da0a385
5b4c852
 
da0a385
5b4c852
da0a385
f9ab841
5b4c852
 
 
4e4b6f0
5b4c852
 
 
d4d1fbd
 
5b4c852
 
50b66c0
5b4c852
 
87762e7
5b4c852
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Liu Yue)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import threading

import torch

# Startup diagnostics, executed as soon as the module is loaded.
# Run `nvidia-smi` through the shell; its output goes to the process's
# stdout/stderr and the exit status is ignored.
os.system('nvidia-smi')
# os.system('apt update -y && apt-get install -y apt-utils && apt install -y unzip')
# Print the cuDNN version as reported by torch.
print(torch.backends.cudnn.version())

import importlib
import sys

# Two hard-coded candidate locations of diffusers' dynamic_modules_utils.py
# (a pyenv 3.10.16 install and a system-wide Python 3.10 install). Each one is
# patched only if it exists on disk.
dynamic_modules_file1 = '/home/user/.pyenv/versions/3.10.16/lib/python3.10/site-packages/diffusers/utils/dynamic_modules_utils.py'
dynamic_modules_file2 = '/usr/local/lib/python3.10/site-packages/diffusers/utils/dynamic_modules_utils.py'

def modify_dynamic_modules_file(dynamic_modules_file):
    """Patch a diffusers ``dynamic_modules_utils.py`` that imports ``cached_download``.

    Every line containing ``from huggingface_hub import cached_download`` is
    replaced by ``from huggingface_hub import hf_hub_download, model_info``;
    all other lines are kept verbatim.

    The file is rewritten only when at least one line actually changed, so an
    already-patched file (or one that never contained the import) is not
    opened for writing at all.

    Args:
        dynamic_modules_file: Path of the file to patch. A path that does not
            point to an existing regular file is ignored.

    Returns:
        None.
    """
    needle = "from huggingface_hub import cached_download"
    replacement = "from huggingface_hub import hf_hub_download, model_info\n"

    # isfile() rather than exists(): a directory at this path must not be opened.
    if not os.path.isfile(dynamic_modules_file):
        return

    with open(dynamic_modules_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    patched = [replacement if needle in line else line for line in lines]
    if patched == lines:
        # Nothing to do; avoid touching (possibly read-only) site-packages.
        return

    with open(dynamic_modules_file, 'w', encoding='utf-8') as file:
        file.writelines(patched)

# Apply the import patch to both candidate diffusers locations. This runs at
# module load time, ahead of the third-party imports that follow below.
modify_dynamic_modules_file(dynamic_modules_file1)
modify_dynamic_modules_file(dynamic_modules_file2)

import sys
import argparse
import gradio as gr
import numpy as np
import torchaudio
import random
import librosa
import spaces
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
# Absolute directory that contains this script.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
# Put the bundled third_party/Matcha-TTS directory on the import path
# (presumably required by the cosyvoice package imported below — confirm).
sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))

from huggingface_hub import snapshot_download
# Fetch the three Hugging Face repositories into ./pretrained_models/ at
# module load time (paths are relative to the current working directory).
snapshot_download('FunAudioLLM/CosyVoice2-0.5B', local_dir='pretrained_models/CosyVoice2-0.5B')
snapshot_download('kemuriririn/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
snapshot_download('FunAudioLLM/SenseVoiceSmall', local_dir='pretrained_models/SenseVoiceSmall')
# Install the two ttsfrd wheels shipped in the downloaded repo and unpack
# resource.zip next to them. The shell exit status is not checked, so a failed
# install or unzip does not stop the script here.
os.system('cd pretrained_models/CosyVoice-ttsfrd/ && pip install ttsfrd_dependency-0.1-py3-none-any.whl && pip install ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl && unzip resource.zip -d .')

from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav, logging
from cosyvoice.utils.common import set_all_random_seed

# Supported inference modes and the usage instructions shown for each one.
# NOTE(review): neither name is referenced by the functions in this file, and
# the instruction text says "no longer than 30s" while generate_audio rejects
# prompts longer than 10s — confirm which limit is intended.
inference_mode_list = ['3s Voice Clone']
instruct_dict = {'3s Voice Clone': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input prompt transcription\n3. click \'Speech Synthesis\' button'}
# (label, value) pairs used as the choices of the streaming radio button.
stream_mode_list = [('No', False), ('Yes', True)]
# Peak absolute amplitude allowed by postprocess(); louder audio is scaled down.
max_val = 0.8
# Lazily created model singletons: see get_cosyvoice() and get_asr().
cosyvoice_instance = None
asr_model = None
# Held by get_cosyvoice() while it checks for / creates cosyvoice_instance.
cosyvoice_lock = threading.Lock()

@spaces.GPU
def get_cosyvoice():
    """Return the shared CosyVoice2 model, constructing it on first use.

    The optional backends are switched on through the environment variables
    ``jit``, ``onnx`` and ``trt``: a flag is enabled only when its variable is
    exactly ``'1'``. The check-and-create step runs while holding
    ``cosyvoice_lock``.

    Returns:
        The module-level ``cosyvoice_instance``.
    """
    global cosyvoice_instance
    flags = {name: os.environ.get(name) == '1' for name in ('jit', 'onnx', 'trt')}
    with cosyvoice_lock:
        if cosyvoice_instance is None:
            logging.info('cosyvoice args load_jit {} load_onnx {} load_trt {}'.format(
                flags['jit'], flags['onnx'], flags['trt']))
            cosyvoice_instance = CosyVoice2(
                'pretrained_models/CosyVoice2-0.5B',
                load_jit=flags['jit'],
                load_onnx=flags['onnx'],
                load_trt=flags['trt'],
            )
        return cosyvoice_instance

@spaces.GPU
def infer_zeroshot(tts_text, prompt_text, prompt_speech_16k, stream, speed=1.0):
    """Stream zero-shot synthesis results from the shared CosyVoice2 model.

    Args:
        tts_text: Text to synthesize.
        prompt_text: Transcription of the prompt audio.
        prompt_speech_16k: Prompt audio as returned by ``load_wav`` /
            ``postprocess``.
        stream: Passed through as ``stream=`` to ``inference_zero_shot``.
        speed: Passed through as ``speed=``. Defaults to ``1.0`` so that
            callers which do not care about speed (such as the start-up
            warm-up call in this file) do not fail with a missing-argument
            ``TypeError``.

    Yields:
        Each item produced by ``cosyvoice.inference_zero_shot``. Nothing is
        yielded when ``cosyvoice.frontend.instruct`` is ``True``; a warning is
        logged instead.
    """
    cosyvoice = get_cosyvoice()
    if cosyvoice.frontend.instruct is True:
        logging.warning('CosyVoice2-0.5B does not support zero-shot inference, please use CosyVoice-300M or CosyVoice-300M-Instruct.')
        return
    yield from cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed)

def get_asr():
    """Return the shared ASR model, creating it on the first call.

    The model is built with ``AutoModel`` from the local
    ``pretrained_models/SenseVoiceSmall`` directory on device ``cuda:0`` and
    cached in the module-level ``asr_model``.
    """
    global asr_model
    if asr_model is None:
        logging.info('asr model load')
        asr_model = AutoModel(
            model="pretrained_models/SenseVoiceSmall",
            disable_update=True,
            log_level='DEBUG',
            device="cuda:0",
        )
    return asr_model

def generate_seed():
    """Build an update payload carrying a fresh random seed.

    Returns:
        A dict with ``"__type__": "update"`` and ``"value"`` set to a random
        integer in ``[1, 100000000]``.
    """
    update = {"__type__": "update"}
    update["value"] = random.randint(1, 100000000)
    return update


def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Trim, peak-limit and pad a prompt waveform.

    Steps:
        1. Trim with ``librosa.effects.trim`` using ``top_db``, ``win_length``
           (as frame length) and ``hop_length``.
        2. If the peak absolute value exceeds the module-level ``max_val``,
           rescale so the peak equals ``max_val``.
        3. Append ``int(target_sr * 0.2)`` zero samples along dimension 1.

    Args:
        speech: Waveform tensor; the final concat with a ``(1, n)`` zero
            tensor on ``dim=1`` requires it to be 2-D with a leading size of 1.
        top_db: Threshold passed to the trim call.
        hop_length: Hop length passed to the trim call.
        win_length: Frame length passed to the trim call.

    Returns:
        The processed tensor.
    """
    trimmed, _ = librosa.effects.trim(
        speech,
        top_db=top_db,
        frame_length=win_length,
        hop_length=hop_length,
    )
    peak = trimmed.abs().max()
    if peak > max_val:
        trimmed = trimmed / peak * max_val
    tail = torch.zeros(1, int(target_sr * 0.2))
    return torch.concat([trimmed, tail], dim=1)

@spaces.GPU
def prompt_wav_recognition(prompt_wav):
    """Transcribe the prompt audio with the shared ASR model.

    This function is registered on the ``change`` event of both audio inputs,
    which also fires when the audio is cleared. In that case (or whenever the
    ASR model returns no result) an empty string is returned instead of
    passing ``None`` to the model or indexing an empty result list.

    Args:
        prompt_wav: File path of the prompt audio, or ``None``/empty when the
            audio component has no value.

    Returns:
        The recognized text: whatever follows the last ``'|>'`` marker in the
        first result's ``"text"`` field, or ``''`` when there is nothing to
        transcribe.
    """
    if not prompt_wav:
        return ''
    res = get_asr().generate(
        input=prompt_wav,
        language="auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
        use_itn=True,
    )
    if not res:
        return ''
    # Keep only the text after the last '|>' marker in the model output.
    return res[0]["text"].split('|>')[-1]

@spaces.GPU
def generate_audio(tts_text, prompt_text, prompt_wav_upload, prompt_wav_record, seed, stream):
    """Validate the prompt and stream synthesized audio for the UI.

    The uploaded file takes precedence over the microphone recording. When
    validation fails, a Gradio warning is shown and a single
    ``(target_sr, default_data)`` item is yielded. The audio metadata is read
    once with ``torchaudio.info`` and reused for all checks.

    Args:
        tts_text: Text to synthesize.
        prompt_text: Transcription of the prompt audio; must not be ``''``.
        prompt_wav_upload: File path of the uploaded prompt audio, or ``None``.
        prompt_wav_record: File path of the recorded prompt audio, or ``None``.
        seed: Value handed to ``set_all_random_seed`` before inference.
        stream: Passed through to ``infer_zeroshot``.

    Yields:
        ``(target_sr, samples)`` tuples, where ``samples`` is the flattened
        numpy array of each result's ``'tts_speech'`` tensor.
    """
    speed = 1.0
    prompt_wav = prompt_wav_upload if prompt_wav_upload is not None else prompt_wav_record

    # Find the first failing check, in the same order as the UI expects.
    problem = None
    if prompt_text == '':
        problem = 'Empty prompt found, please check the prompt text.'
    elif prompt_wav is None:
        problem = 'Empty prompt found, please upload or record audio.'
    else:
        info = torchaudio.info(prompt_wav)
        if info.num_frames / info.sample_rate > 10:
            problem = 'Please use prompt audio shorter than 10s.'
        elif info.sample_rate < prompt_sr:
            problem = 'Prompt wav sample rate {}, lower than {}.'.format(info.sample_rate, prompt_sr)

    if problem is not None:
        gr.Warning(problem)
        yield (target_sr, default_data)
        return

    prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
    set_all_random_seed(seed)
    for i in infer_zeroshot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
        yield (target_sr, i['tts_speech'].numpy().flatten())

def main():
    """Build the Gradio Blocks interface for the voice-clone demo and launch it.

    Layout, top to bottom: two Markdown headers, the text to synthesize, a row
    with the upload and microphone audio inputs, the prompt transcription, a
    row with the streaming radio and the seed controls, the synthesis button
    and the audio output. The app is launched with ``max_threads=4``.
    """
    default_tts_text = "CosyVoice is undergoing a comprehensive upgrade, providing more accurate, stable, faster, and better voice generation capabilities."
    transcript_hint = "Prompt transcription (auto ASR, you can correct the recognition results)"

    with gr.Blocks() as demo:
        gr.Markdown("### 3s Voice Clone")
        gr.Markdown("#### Clone any voice with just 3 seconds of audio. Upload or record audio, input transcription, and click 'Speech Synthesis'.")

        text_box = gr.Textbox(label="Text to synthesize", lines=1, value=default_tts_text)
        with gr.Row():
            upload_audio = gr.Audio(sources='upload', type='filepath', label='Prompt wav file (sample rate >= 16kHz)')
            mic_audio = gr.Audio(sources='microphone', type='filepath', label='Record prompt from your microphone')
        transcript_box = gr.Textbox(label="Prompt Transcription", lines=1, placeholder=transcript_hint, value='')
        with gr.Row():
            stream_radio = gr.Radio(choices=stream_mode_list, label='Streaming or not', value=stream_mode_list[0][1])
            with gr.Column(scale=0.25):
                dice_button = gr.Button(value="\U0001F3B2")
                seed_box = gr.Number(value=0, label="Random Seed")

        synth_button = gr.Button("Speech Synthesis")
        result_audio = gr.Audio(label="Audio Output", autoplay=True, streaming=False)

        # Event wiring.
        dice_button.click(generate_seed, inputs=[], outputs=seed_box)
        synth_button.click(
            generate_audio,
            inputs=[text_box, transcript_box, upload_audio, mic_audio, seed_box, stream_radio],
            outputs=[result_audio],
        )
        # Either audio input re-runs ASR and fills the transcription box.
        for audio_source in (upload_audio, mic_audio):
            audio_source.change(fn=prompt_wav_recognition, inputs=[audio_source], outputs=[transcript_box])

    demo.launch(max_threads=4)

if __name__ == '__main__':
    # Module-level settings read by postprocess() and generate_audio().
    # Defined first so that they exist before any code path can need them.
    prompt_sr, target_sr = 16000, 24000
    default_data = np.zeros(target_sr)

    # sft_spk = cosyvoice.list_avaliable_spks()
    # Warm-up: run one streaming and one non-streaming synthesis and discard
    # the output. `speed` is passed explicitly because infer_zeroshot declares
    # it as a parameter; omitting it raised TypeError on start-up.
    prompt_speech_16k = load_wav('zero_shot_prompt.wav', prompt_sr)
    for stream in [True, False]:
        for _ in infer_zeroshot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=stream, speed=1.0):
            pass

    main()