import numpy as np
import soundfile
import msinference  # Prefer live_demo.py instead as this demo.py has no split to sentences to prevent OOM
from audiocraft.builders import AudioGen  # fixed bug for repeated calls

# Sample rate (Hz) used both to size the background sound and to write demo.wav.
SAMPLE_RATE = 16000


def _style_wav_path(wav_dir, voice):
    """Return the reference .wav path inside `wav_dir` for a voice identifier.

    The voice id is flattened into a file stem: '/' and '#' become '_',
    'cmu-arctic' becomes 'cmu_arctic', and any '_low' marker is removed.
    """
    stem = voice.replace('/', '_')
    stem = stem.replace('#', '_')
    stem = stem.replace('cmu-arctic', 'cmu_arctic')
    stem = stem.replace('_low', '')
    return wav_dir + stem + '.wav'


def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
              voice='en_US/m-ailabs_low#mary_ann',  # Listen to voices https://huggingface.co/dkounadis/artificial-styletts2/discussions/1
              soundscape='birds fomig'):  # purposeful spells for AudioGen (behaves as controllable top-p)
    """Synthesize speech for `text` and optionally mix in a generated soundscape.

    Args:
        text: Text to be spoken.
        voice: Voice identifier. Ids containing 'en_US/' or 'en_UK/' use a
            reference wav from 'assets/wavs/style_vector/'; any other id
            containing '_' uses one from 'assets/wavs/mimic3_foreign_4x/';
            everything else is passed as `lang` to `msinference.foreign`.
        soundscape: Prompt for AudioGen background sound, or None to return
            speech only.

    Returns:
        The peak-normalised waveform; if `soundscape` is not None, the
        0.6 / 0.4 mix of speech and generated background.
        NOTE(review): presumably a numpy array sampled at SAMPLE_RATE Hz -
        confirm against msinference.
    """
    # Pick the directory holding the style reference wav (None -> no style wav).
    if ('en_US/' in voice) or ('en_UK/' in voice):
        wav_dir = 'assets/wavs/style_vector/'
    elif '_' in voice:
        wav_dir = 'assets/wavs/mimic3_foreign_4x/'
    else:
        wav_dir = None

    if wav_dir is None:
        x = msinference.foreign(text=text, lang=voice)
    else:
        style_vector = msinference.compute_style(_style_wav_path(wav_dir, voice))
        x = msinference.inference(text, style_vector)

    # Scale so the peak sits just below full scale (1 / 1.02); the 1e-7 term
    # avoids a division by zero when the signal is all zeros.
    x /= 1.02 * np.abs(x).max() + 1e-7

    if soundscape is not None:
        # NOTE(review): device is hard-coded to 'cuda:0'; this fails without a GPU.
        sound_gen = AudioGen().to('cuda:0').eval()
        background = sound_gen.generate(
            soundscape,
            # Sound duration in seconds: speech length plus 0.74 s, presumably
            # so the slice below always covers the whole speech signal.
            duration=len(x) / SAMPLE_RATE + .74,
        ).detach().cpu().numpy()
        # NOTE(review): assumes background's first axis is time - confirm.
        x = .6 * x + .4 * background[:len(x)]
    return x


# NOTE(review): this runs at import time (no `__main__` guard); kept as-is so
# existing behavior of the demo script is unchanged.
soundfile.write('demo.wav', tts_entry(), SAMPLE_RATE)