|
import numpy as np |
|
import soundfile |
|
import msinference |
|
from audiocraft.builders import AudioGen |
|
|
|
def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
              voice='en_US/m-ailabs_low#mary_ann',
              soundscape='birds fomig'):
    """Synthesize speech for `text` and optionally mix in a generated soundscape.

    Voice routing:
      * `voice` contains 'en_US/' or 'en_UK/': a style vector is computed from
        a reference wav under 'assets/wavs/style_vector/' and passed to
        `msinference.inference`.
      * otherwise, `voice` contains '_': same procedure, but the reference wav
        is taken from 'assets/wavs/mimic3_foreign_4x/'.
      * otherwise: `voice` is forwarded as `lang` to `msinference.foreign`.

    The waveform is then scaled in place so its absolute peak sits just
    below 1. When `soundscape` is not None, an `AudioGen` model is created on
    'cuda:0', asked for audio described by `soundscape`, and the result is
    blended with the speech (0.6 speech + 0.4 background).

    Args:
        text: Text to be spoken.
        voice: Voice identifier (or language tag for the fallback branch).
        soundscape: Prompt for the background audio; None disables mixing.

    Returns:
        The (optionally mixed) waveform.
        NOTE(review): presumably a 1-D float numpy array at 16 kHz -- the
        code divides by 16000 to get seconds and the array is normalized in
        place; confirm against `msinference`.
    """
    is_english = any(tag in voice for tag in ('en_US/', 'en_UK/'))

    if is_english or '_' in voice:
        # Both styled branches share the same file-name mangling; only the
        # directory holding the reference wav differs.
        if is_english:
            wav_dir = 'assets/wavs/style_vector/'
        else:
            wav_dir = 'assets/wavs/mimic3_foreign_4x/'

        wav_stem = voice.replace('/', '_')
        wav_stem = wav_stem.replace('#', '_')
        wav_stem = wav_stem.replace('cmu-arctic', 'cmu_arctic')
        wav_stem = wav_stem.replace('_low', '')

        style_vector = msinference.compute_style(wav_dir + wav_stem + '.wav')
        x = msinference.inference(text, style_vector)
    else:
        x = msinference.foreign(text=text, lang=voice)

    # Peak-normalize with 2% headroom; the epsilon guards against an
    # all-zero waveform.
    x /= 1.02 * np.abs(x).max() + 1e-7

    if soundscape is not None:
        generator = AudioGen().to('cuda:0').eval()
        # Request slightly more audio than the speech lasts so the slice
        # below can cover every speech sample.
        seconds = len(x)/16000 + .74
        ambience = generator.generate(soundscape, duration=seconds)
        ambience = ambience.detach().cpu().numpy()
        x = .6 * x + .4 * ambience[:len(x)]

    return x
|
|
|
# Run the pipeline with its default arguments and save the result as
# 'demo.wav' with a sample rate of 16000.
soundfile.write(
    'demo.wav',
    tts_entry(),
    samplerate=16000,
)
|
|