File size: 3,771 Bytes
856e338
 
 
 
 
f95d18e
 
 
856e338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f95d18e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
856e338
f95d18e
 
 
 
856e338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f95d18e
856e338
f95d18e
 
856e338
f95d18e
 
 
 
 
856e338
f95d18e
 
856e338
f95d18e
 
856e338
f95d18e
 
 
 
856e338
f95d18e
 
 
856e338
f95d18e
 
 
856e338
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import streamlit as st
import speech_recognition as sr
from transformers import MarianMTModel, MarianTokenizer
from gtts import gTTS
from io import BytesIO
import queue
import threading
import pyaudio

def load_model(source_lang, target_lang):
    """Fetch the MarianMT tokenizer and model for one translation direction.

    The checkpoint name is built as
    ``Helsinki-NLP/opus-mt-<source_lang>-<target_lang>``. Nothing is cached
    here, so every call goes through ``from_pretrained`` again.

    Args:
        source_lang: Language code of the text to translate from.
        target_lang: Language code of the text to translate into.

    Returns:
        A ``(tokenizer, model)`` tuple, or ``(None, None)`` when loading
        fails for any reason; in that case the failure is also shown on the
        page through ``st.error``.
    """
    checkpoint = "Helsinki-NLP/opus-mt-{}-{}".format(source_lang, target_lang)
    try:
        # The tokenizer is loaded first, then the model, from the same checkpoint.
        loaded = (
            MarianTokenizer.from_pretrained(checkpoint),
            MarianMTModel.from_pretrained(checkpoint),
        )
    except Exception as exc:
        st.error(f"Failed to load model for {source_lang} to {target_lang}. Ensure the language pair is supported. Error: {exc}")
        return None, None
    else:
        return loaded

def translate_text(tokenizer, model, text):
    """Translate *text* with the given tokenizer/model pair.

    Args:
        tokenizer: Callable tokenizer that also provides ``decode`` (the
            file passes a ``MarianTokenizer``).
        model: Model exposing ``generate`` (the file passes a
            ``MarianMTModel``).
        text: Text to translate.

    Returns:
        The decoded translation of the first generated sequence, or ``""``
        when *text* is empty, ``None`` or contains only whitespace. In those
        cases the tokenizer and model are not touched at all.
    """
    # Blank input carries nothing to translate; skipping it avoids running
    # the model on an input that holds no words.
    if not text or (isinstance(text, str) and not text.strip()):
        return ""
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    generated = model.generate(**encoded)
    # Only the first generated sequence is decoded and returned.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

def text_to_audio(text, lang):
    """Synthesize *text* with gTTS and hand back the audio as an in-memory file.

    Args:
        text: Text to be spoken.
        lang: Language code passed to gTTS.

    Returns:
        A ``BytesIO`` holding the data gTTS wrote, rewound to position 0 so
        it can be read from the start.
    """
    speech = gTTS(text=text, lang=lang)
    buffer = BytesIO()
    speech.write_to_fp(buffer)
    # Rewind: the write above leaves the cursor at the end of the data.
    buffer.seek(0)
    return buffer

# Marker queued when audio was captured but no words could be made out.
_UNINTELLIGIBLE = "[Unintelligible]"

# Regional tag handed to the speech recognizer for each translation language
# code; codes missing from this table are passed through unchanged.
_SPEECH_LOCALES = {
    "en": "en-US",
    "es": "es-ES",
    "fr": "fr-FR",
    "de": "de-DE",
    "it": "it-IT",
    "ru": "ru-RU",
    "zh": "zh-CN",
    "ja": "ja-JP",
    "ko": "ko-KR",
}


def recognize_speech_live(q, language="en-US", stop_event=None):
    """Transcribe microphone input phrase by phrase and push results onto *q*.

    Meant to run in a background thread. It deliberately issues no Streamlit
    commands: all feedback, including failures, travels through the queue so
    that the script thread can display it.

    Args:
        q: Queue that receives one item per captured phrase: the recognized
            text, the ``"[Unintelligible]"`` marker when the audio could not
            be understood, or the exception instance that ended recognition
            (always the last item queued).
        language: Language tag forwarded to ``recognize_google``. The default
            keeps the behaviour of calling it without a language.
        stop_event: Optional ``threading.Event``. Once set, the loop ends
            after the phrase currently being captured, which closes the
            microphone.
    """
    recognizer = sr.Recognizer()
    try:
        # Opening the microphone is inside the try so that a missing or busy
        # input device is reported to the consumer instead of silently
        # killing the thread.
        with sr.Microphone() as source:
            recognizer.adjust_for_ambient_noise(source)
            while stop_event is None or not stop_event.is_set():
                audio_data = recognizer.listen(source)
                try:
                    q.put(recognizer.recognize_google(audio_data, language=language))
                except sr.UnknownValueError:
                    # Unrecognizable audio is not fatal; keep listening.
                    q.put(_UNINTELLIGIBLE)
    except Exception as e:
        # Thread boundary: hand the failure to the consumer and stop.
        q.put(e)


def main():
    """Render the page and, once started, run the listen/translate/speak loop.

    The user picks a source and a target language. After the start button is
    pressed, a daemon thread transcribes microphone input in the source
    language and this function translates each phrase, shows the
    transcription and translation, and plays the translation as audio.
    The loop ends when the recognition thread reports an error.
    """
    import uuid  # local import: only needed to build unique widget keys

    st.title("Real-Time Audio Language Translation")
    st.write("Translate spoken words in real time using open-source models.")

    # Display name -> language code used for the model name and text-to-speech.
    languages = {
        "English": "en",
        "Spanish": "es",
        "French": "fr",
        "German": "de",
        "Italian": "it",
        "Russian": "ru",
        "Chinese": "zh",
        "Japanese": "ja",
        "Korean": "ko",
    }

    source_language = st.selectbox("Select source language:", options=list(languages))
    target_language = st.selectbox("Select target language:", options=list(languages))

    if source_language == target_language:
        st.warning("Source and target languages must be different.")
        return

    source_lang_code = languages[source_language]
    target_lang_code = languages[target_language]

    # Load the model; load_model has already reported the problem on failure.
    tokenizer, model = load_model(source_lang_code, target_lang_code)
    if tokenizer is None or model is None:
        return

    q = queue.Queue()
    transcription_placeholder = st.empty()
    translation_placeholder = st.empty()
    audio_placeholder = st.empty()

    if not st.button("Start Real-Time Translation"):
        return

    st.write("Processing...")
    # Shown from the script thread because the recognition thread has no way
    # to write to the page; it appears slightly before calibration finishes.
    st.info("Start speaking...")

    # Recognize speech in the language the user selected as the source.
    speech_language = _SPEECH_LOCALES.get(source_lang_code, source_lang_code)
    stop_event = threading.Event()
    threading.Thread(
        target=recognize_speech_live,
        args=(q,),
        kwargs={"language": speech_language, "stop_event": stop_event},
        daemon=True,
    ).start()

    # The text areas are re-created on every phrase within a single script
    # run; a unique key per phrase keeps two identical phrases from producing
    # widgets with the same identity.
    session_id = uuid.uuid4().hex
    phrase_no = 0
    try:
        while True:
            # Blocks until the worker delivers something (no busy-waiting).
            item = q.get()
            if isinstance(item, Exception):
                st.error(f"Error during speech recognition: {item}")
                break

            phrase_no += 1
            transcription_placeholder.text_area(
                "Transcribed Text:",
                item,
                height=100,
                key=f"transcription_{session_id}_{phrase_no}",
            )
            if item == _UNINTELLIGIBLE:
                # Show the marker, but do not translate or speak it.
                continue

            translated_text = translate_text(tokenizer, model, item)
            translation_placeholder.text_area(
                "Translated Text:",
                translated_text,
                height=100,
                key=f"translation_{session_id}_{phrase_no}",
            )
            if not translated_text:
                # gTTS rejects empty text, so there is nothing to play.
                continue

            translated_audio = text_to_audio(translated_text, target_lang_code)
            audio_placeholder.audio(translated_audio, format="audio/mp3")
    finally:
        # However the loop is left, ask the worker to stop so that it does
        # not keep the microphone open indefinitely.
        stop_event.set()

# Build the page only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()