"""Gradio app: an Indic-language assistant with audio or text input.

Audio input is passed through the Shuka v1 pipeline, the resulting text (or
the typed text) is passed to the sarvam-2b text-generation pipeline, and the
generated response is converted to speech with gTTS.
"""
import os
from collections import Counter

import transformers
import librosa
from transformers import pipeline
from gtts import gTTS
import gradio as gr
import torch
import spaces


class _StepError(Exception):
    """Internal failure of one assistant step.

    ``str(exc)`` is the exact user-facing message. Raising instead of
    returning the message lets the orchestrator stop early rather than feed
    an error string into the next model.
    """


# Function to safely load pipeline
def load_pipeline(model_name, **kwargs):
    """Build a transformers pipeline for ``model_name``.

    Args:
        model_name: Model identifier passed as ``model=`` to
            ``transformers.pipeline``.
        **kwargs: Forwarded unchanged to ``transformers.pipeline``.

    Returns:
        The pipeline object, or ``None`` if construction raised (the error is
        printed, not re-raised, so the app can still start).
    """
    try:
        return transformers.pipeline(model=model_name, **kwargs)
    except Exception as e:
        print(f"Error loading {model_name} pipeline: {e}")
        return None


# Load Shuka v1 for speech recognition
@spaces.GPU
def load_shuka():
    """Load the Shuka v1 pipeline; returns ``None`` when loading fails."""
    # load_pipeline already catches and reports every exception, so no extra
    # try/except is needed here.
    return load_pipeline('sarvamai/shuka_v1', trust_remote_code=True, torch_dtype=torch.float32)


# Load sarvam-2b for text generation
@spaces.GPU
def load_sarvam():
    """Load the sarvam-2b pipeline; returns ``None`` when loading fails."""
    return load_pipeline('sarvamai/sarvam-2b-v0.5')


# Attempt to load models
shuka_pipe = load_shuka()
sarvam_pipe = load_sarvam()


def _transcribe_audio(audio):
    """Run the Shuka pipeline on an audio file; raise ``_StepError`` on failure."""
    if shuka_pipe is None:
        raise _StepError("Error: Shuka v1 model is not available. Please type your message instead.")
    try:
        # Resample to 16 kHz while loading.
        waveform, sr = librosa.load(audio, sr=16000)
        turns = [
            {'role': 'system', 'content': 'Respond naturally and informatively.'},
            {'role': 'user', 'content': '<|audio|>'}
        ]
        result = shuka_pipe({'audio': waveform, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
        return result[0]['generated_text']
    except Exception as e:
        raise _StepError(f"Error processing audio: {str(e)}. Please type your message instead.") from e


def process_audio_input(audio):
    """Convert an audio file to text with the Shuka pipeline.

    Args:
        audio: Path of the audio file (anything ``librosa.load`` accepts).

    Returns:
        The generated text, or a human-readable error message string when the
        model is unavailable or processing fails. This function never raises.
    """
    try:
        return _transcribe_audio(audio)
    except _StepError as err:
        return str(err)


def _generate_text(text_input):
    """Run the sarvam-2b pipeline on ``text_input``; raise ``_StepError`` on failure."""
    if sarvam_pipe is None:
        raise _StepError("Error: sarvam-2b model is not available. The assistant cannot generate responses at this time.")
    try:
        return sarvam_pipe(text_input, max_new_tokens=100, temperature=0.7, repetition_penalty=1.2)[0]['generated_text']
    except Exception as e:
        raise _StepError(f"Error generating response: {str(e)}") from e


def generate_response(text_input):
    """Generate a text response for ``text_input`` with the sarvam-2b pipeline.

    Returns:
        The generated text, or a human-readable error message string when the
        model is unavailable or generation fails. This function never raises.
    """
    try:
        return _generate_text(text_input)
    except _StepError as err:
        return str(err)


def text_to_speech(text, lang='hi'):
    """Synthesize ``text`` to ``response.mp3`` using gTTS.

    Args:
        text: Text to speak.
        lang: gTTS language code (default ``'hi'``).

    Returns:
        The path ``"response.mp3"`` on success, or ``None`` if gTTS raised
        (the error is printed). NOTE(review): gTTS may reject some of the
        codes ``detect_language`` can return (e.g. ``'or'``) — in that case
        this returns ``None``; confirm against the installed gTTS version.
    """
    try:
        tts = gTTS(text=text, lang=lang)
        tts.save("response.mp3")
        return "response.mp3"
    except Exception as e:
        print(f"Error in text-to-speech: {str(e)}")
        return None


# Unicode blocks of the supported Indic scripts, mapped to the language code
# used for speech synthesis: (first code point, last code point, code).
_INDIC_SCRIPT_RANGES = (
    (0x0900, 0x097F, 'hi'),  # Devanagari (shared by Hindi and Marathi)
    (0x0980, 0x09FF, 'bn'),  # Bengali
    (0x0A00, 0x0A7F, 'pa'),  # Gurmukhi (Punjabi)
    (0x0A80, 0x0AFF, 'gu'),  # Gujarati
    (0x0B00, 0x0B7F, 'or'),  # Oriya
    (0x0B80, 0x0BFF, 'ta'),  # Tamil
    (0x0C00, 0x0C7F, 'te'),  # Telugu
    (0x0C80, 0x0CFF, 'kn'),  # Kannada
    (0x0D00, 0x0D7F, 'ml'),  # Malayalam
)


def detect_language(text):
    """Guess the language code of ``text`` from the script it is written in.

    Counts characters that fall in each supported Indic Unicode block and
    returns the code of the most frequent script. Devanagari text is reported
    as ``'hi'`` because Hindi and Marathi cannot be told apart by script.

    Args:
        text: Text to inspect; ``None`` or empty text is treated as having no
            Indic characters.

    Returns:
        One of ``'hi'``, ``'bn'``, ``'pa'``, ``'gu'``, ``'or'``, ``'ta'``,
        ``'te'``, ``'kn'``, ``'ml'``, or ``'en'`` when no Indic script
        character is found.
    """
    script_counts = Counter()
    for char in text or "":
        code_point = ord(char)
        for first, last, code in _INDIC_SCRIPT_RANGES:
            if first <= code_point <= last:
                script_counts[code] += 1
                break
    if not script_counts:
        return 'en'  # Default to English if no Indic script is detected
    # Ties resolve to the script that appeared first in the text.
    return script_counts.most_common(1)[0][0]


def indic_language_assistant(input_type, audio_input, text_input):
    """Gradio callback: turn audio/text input into a text and audio response.

    Args:
        input_type: ``"audio"`` or ``"text"`` (the selected radio option).
        audio_input: Path of the recorded audio file, or ``None``.
        text_input: Typed message, possibly empty.

    Returns:
        A tuple ``(transcription, response, audio_path)``. When a step fails
        the corresponding slot holds the error message and ``audio_path`` is
        ``None``; a failed step's error text is never passed on to the next
        model or to speech synthesis.
    """
    try:
        if input_type == "audio" and audio_input is not None:
            try:
                transcription = _transcribe_audio(audio_input)
            except _StepError as err:
                # Do not ask the language model to answer an error message.
                failure = str(err)
                return failure, failure, None
        elif input_type == "text" and text_input:
            transcription = text_input
        else:
            return "Please provide either audio or text input.", "No input provided.", None

        try:
            response = _generate_text(transcription)
        except _StepError as err:
            # Show the error as text only; there is nothing useful to speak.
            return transcription, str(err), None

        lang = detect_language(response)
        audio_response = text_to_speech(response, lang)
        return transcription, response, audio_response
    except Exception as e:
        error_message = f"An error occurred: {str(e)}"
        return error_message, error_message, None


# Create Gradio interface
# NOTE(review): `gr.Audio(source=...)` is the Gradio 3 spelling; newer Gradio
# releases appear to expect `sources=[...]` — confirm against the pinned version.
iface = gr.Interface(
    fn=indic_language_assistant,
    inputs=[
        gr.Radio(["audio", "text"], label="Input Type", value="audio"),
        gr.Audio(source="microphone", type="filepath", label="Speak (if audio input selected)"),
        gr.Textbox(label="Type your message (if text input selected)")
    ],
    outputs=[
        gr.Textbox(label="Transcription/Input"),
        gr.Textbox(label="Generated Response"),
        gr.Audio(label="Audio Response")
    ],
    title="Indic Language Virtual Assistant",
    description="Speak or type in any supported Indic language or English. The assistant will respond in text and audio."
)

# Launch the app
iface.launch()