import numpy as np
import soundfile
import msinference  # Prefer live_demo.py: this demo.py does not split the text into sentences, which is what prevents OOM on long inputs
from audiocraft.builders import AudioGen  # fixed bug for repeated calls

def tts_entry(
        text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
        voice='en_US/m-ailabs_low#mary_ann',  # Listen to voices https://huggingface.co/dkounadis/artificial-styletts2/discussions/1
        soundscape='birds fomig'):  # intentionally misspelled prompt for AudioGen (behaves as controllable top-p)
    """Synthesise `text` with `voice`, optionally mixed over a generated soundscape.

    Args:
        text: Text handed to `msinference` for synthesis.
        voice: Voice identifier. Identifiers containing 'en_US/' or 'en_UK/'
            use a reference wav from 'assets/wavs/style_vector/'; any other
            identifier containing '_' uses one from
            'assets/wavs/mimic3_foreign_4x/'; everything else is forwarded
            to `msinference.foreign` as `lang`.
        soundscape: Prompt passed to `AudioGen.generate`. `None` skips the
            background and returns the speech alone.

    Returns:
        The peak-normalised speech, or a 0.6 / 0.4 blend of speech and
        background when `soundscape` is given. Durations are computed
        assuming 16000 samples per second.
    """
    # Choose the folder holding the reference wav; None means "no style wav".
    if 'en_US/' in voice or 'en_UK/' in voice:
        style_dir = 'assets/wavs/style_vector/'
    elif '_' in voice:
        style_dir = 'assets/wavs/mimic3_foreign_4x/'
    else:
        style_dir = None

    if style_dir is None:
        speech = msinference.foreign(text=text, lang=voice)
    else:
        # Turn the voice identifier into the file stem of its reference wav.
        # The substitutions are applied in this exact order.
        wav_stem = voice
        for old, new in (('/', '_'),
                         ('#', '_'),
                         ('cmu-arctic', 'cmu_arctic'),
                         ('_low', '')):
            wav_stem = wav_stem.replace(old, new)
        style_vector = msinference.compute_style(style_dir + wav_stem + '.wav')
        speech = msinference.inference(text, style_vector)

    # Scale in place so the peak magnitude lands just under 1: the 1.02 factor
    # leaves a little headroom and 1e-7 guards against dividing by zero.
    peak = np.abs(speech).max()
    speech /= 1.02 * peak + 1e-7

    if soundscape is None:
        return speech

    sound_gen = AudioGen().to('cuda:0').eval()
    n_samples = len(speech)
    # Request slightly more audio (+.74 s) than the speech lasts, then trim
    # the background to the speech length before mixing.
    background = sound_gen.generate(soundscape, duration=n_samples / 16000 + .74)
    background = background.detach().cpu().numpy()
    return .6 * speech + .4 * background[:n_samples]

# Run the demo with its default arguments and save the result as a 16 kHz wav.
soundfile.write(file='demo.wav', data=tts_entry(), samplerate=16000)