Update app.py
Browse files
app.py
CHANGED
@@ -59,7 +59,27 @@ _ = utils.load_checkpoint("pretrained_ljs.pth", net_g, None)
|
|
59 |
st.title("VITS Text-to-Speech Demo")
|
60 |
|
61 |
# Input text box for user to enter text
|
62 |
-
text_input = st.text_input("Enter text to convert to speech", value="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
if st.button("Generate Speech"):
|
65 |
# Convert the text to the appropriate format (e.g., phoneme or character representation)
|
|
|
59 |
st.title("VITS Text-to-Speech Demo")
|
60 |
|
61 |
# Input text box for user to enter text
|
62 |
+
text_input = st.text_input("Enter text to convert to speech", value="Chào mừng các bạn đến với môn Xử lí tiếng nói")
|
63 |
+
|
64 |
+
##### A demo for the input text #####
|
65 |
+
# Convert the text to the appropriate format (e.g., phoneme or character representation)
|
66 |
+
stn_tst = get_text(text_input, hps)
|
67 |
+
|
68 |
+
with torch.no_grad():
|
69 |
+
x_tst = stn_tst.unsqueeze(0)
|
70 |
+
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
|
71 |
+
audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.float().numpy()
|
72 |
+
|
73 |
+
# Use hps.data.sampling_rate for playing the audio
|
74 |
+
st.text("Before Fine-tuned:")
|
75 |
+
st.audio(audio, format="audio/wav", sample_rate=hps.data.sampling_rate)
|
76 |
+
|
77 |
+
get_vi_audio(text_input)
|
78 |
+
|
79 |
+
st.text("After Fine-tuned:")
|
80 |
+
st.audio("vi_output.wav", format="audio/wav")
|
81 |
+
|
82 |
+
##### User's Inference #####
|
83 |
|
84 |
if st.button("Generate Speech"):
|
85 |
# Convert the text to the appropriate format (e.g., phoneme or character representation)
|