text_to_speach / app.py
Beehzod's picture
Update app.py
7bfb172 verified
raw
history blame
3.49 kB
import streamlit as st
from transformers import SeamlessM4Tv2Model, AutoProcessor
import torch
import numpy as np
from scipy.io.wavfile import write
import re
from io import BytesIO
# Load the processor and model.
# Streamlit re-executes this script from top to bottom on every widget
# interaction, so loading the checkpoint directly at module level would rebuild
# the model on each button click. st.cache_resource keeps one shared instance
# for the lifetime of the server process.
@st.cache_resource
def _load_seamless(checkpoint="facebook/seamless-m4t-v2-large"):
    """Load the SeamlessM4T v2 processor and model and place the model on a device.

    Args:
        checkpoint: Hugging Face model id (or local path) to load.

    Returns:
        A ``(processor, model, device)`` tuple; ``device`` is CUDA when
        ``torch.cuda.is_available()`` is true, otherwise CPU.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    processor = AutoProcessor.from_pretrained(checkpoint)
    model = SeamlessM4Tv2Model.from_pretrained(checkpoint)
    model.to(device)
    return processor, model, device


processor, model, device = _load_seamless()
# Uzbek (Latin script) spellings for the numbers that number_to_words looks up
# directly: 0-20, the round tens, and the scale words. Every entry uses the
# plain ASCII apostrophe so the spelling is consistent across the table.
number_words = {
    0: "nol", 1: "bir", 2: "ikki", 3: "uch", 4: "to'rt",
    5: "besh", 6: "olti", 7: "yetti", 8: "sakkiz", 9: "to'qqiz",
    10: "o'n", 11: "o'n bir", 12: "o'n ikki", 13: "o'n uch", 14: "o'n to'rt",
    # 16 previously contained a dotless "ı" and 19 was missing its apostrophe.
    15: "o'n besh", 16: "o'n olti", 17: "o'n yetti", 18: "o'n sakkiz", 19: "o'n to'qqiz",
    20: "yigirma", 30: "o'ttiz", 40: "qirq", 50: "ellik",
    60: "oltmish", 70: "yetmish", 80: "sakson", 90: "to'qson",
    100: "yuz", 1000: "ming", 1000000: "million",
}
def number_to_words(number):
    """Spell out *number* in Uzbek words.

    Values below 20 are read straight from ``number_words``; values below 100
    combine a tens word with an optional unit word. Larger values are split
    with ``divmod`` into a count of the biggest applicable scale (yuz, ming,
    million, milliard) plus a remainder, and both parts are spelled
    recursively. A count of exactly one hundred or one thousand is written as
    the bare scale word, while millions and milliards always carry their
    count. Values of 10**12 and above are returned as their decimal string.
    """
    if number < 20:
        return number_words[number]

    if number < 100:
        tens, unit = divmod(number, 10)
        spoken = number_words[tens * 10]
        if unit:
            spoken += " " + number_words[unit]
        return spoken

    # (exclusive upper bound, divisor, scale word, omit the count when it is 1)
    scales = (
        (1000, 100, "yuz", True),
        (1000000, 1000, "ming", True),
        (1000000000, 1000000, "million", False),
        (1000000000000, 1000000000, "milliard", False),
    )
    for upper, divisor, scale_word, bare_when_one in scales:
        if number < upper:
            count, rest = divmod(number, divisor)
            if bare_when_one and not count > 1:
                spoken = scale_word
            else:
                spoken = number_to_words(count) + " " + scale_word
            if rest:
                spoken += " " + number_to_words(rest)
            return spoken

    # Beyond the largest supported scale: fall back to the digits themselves.
    return str(number)
def replace_numbers_with_words(text):
    """Return *text* with each standalone digit run spelled out in words.

    A digit run is only replaced when it is delimited by word boundaries, so
    digits glued to letters (for example ``x2y``) are left untouched. Each
    match is converted with ``int`` and passed to ``number_to_words``.
    """
    return re.sub(
        r'\b\d+\b',
        lambda found: number_to_words(int(found.group())),
        text,
    )
# Replacements: (spelling to look for, spelling to substitute) pairs.
# Each listed word is written with a typographic quote (‘) on the left and
# with a plain apostrophe (') on the right.
replacements = [
    ("bo‘ladi", "bo'ladi"),
    ("yog‘ingarchilik", "yog'ingarchilik"),
]


def cleanup_text(text):
    """Apply every pair in ``replacements`` to *text*, in order, and return it.

    Only the exact spellings listed in ``replacements`` are rewritten; all
    other text is returned unchanged.
    """
    for pair in replacements:
        text = text.replace(*pair)
    return text
# Streamlit App
st.title("Text-to-Speech using Seamless M4T Model")

# User Input
user_input = st.text_area("Enter the text for speech generation", height=200)

# Process the text and generate speech
if st.button("Generate Speech"):
    if user_input.strip():
        # Generation can take a while; show progress while it runs.
        with st.spinner("Generating speech..."):
            # Apply text transformations: spell out digits first, then
            # rewrite the spellings listed in `replacements`.
            converted_text = replace_numbers_with_words(user_input)
            cleaned_text = cleanup_text(converted_text)

            # Process input for model (Uzbek in, Uzbek speech out)
            inputs = processor(text=cleaned_text, src_lang="uzn", return_tensors="pt").to(device)

            # Generate audio from text; element 0 of the result is the waveform
            audio_array_from_text = model.generate(**inputs, tgt_lang="uzn")[0].cpu().numpy().squeeze()

            # Write the WAV header with the rate the model was configured
            # for instead of a hard-coded 16000, so the two cannot drift apart.
            sample_rate = model.config.sampling_rate

            # Save to an in-memory buffer
            audio_io = BytesIO()
            write(audio_io, sample_rate, audio_array_from_text.astype(np.float32))
            audio_io.seek(0)

        # Provide audio for playback
        st.audio(audio_io, format='audio/wav')
    else:
        st.warning("Please enter some text to generate speech.")