Spaces:

helvekami
/

ShukaNote

Running

File size: 2,345 Bytes

98333ca
e2f65f6
 
 
98333ca
e2f65f6
 
 
 
 
 
 
98333ca
e2f65f6
 
 
 
 
fbc6758
e2f65f6
fbc6758
 
 
 
 
 
 
 
 
e2f65f6
 
fbc6758
 
 
 
 
 
 
e2f65f6
 
 
 
 
fbc6758
 
 
 
 
 
e2f65f6
 
 
 
fbc6758
e2f65f6
 
fbc6758
 
e2f65f6
 
fbc6758
e2f65f6
 
fbc6758
e2f65f6
 
 
fbc6758

import gradio as gr
import librosa
import numpy as np
import torch
import transformers

# Build the Shuka pipeline once, at import time, so every request reuses it.
# Placement and precision both follow CUDA availability: first GPU with
# bfloat16 when CUDA is present, otherwise CPU (-1) with the default dtype.
# NOTE: trust_remote_code=True lets code shipped in the model repository run
# locally; keep the model id pointed at a source you trust.
_cuda_available = torch.cuda.is_available()
pipe = transformers.pipeline(
    model="sarvamai/shuka_v1",
    trust_remote_code=True,
    device=0 if _cuda_available else -1,
    torch_dtype=torch.bfloat16 if _cuda_available else None,
)

# Sample rate (Hz) the audio is resampled to before it is handed to the model.
_TARGET_SAMPLE_RATE = 16000

_EMPTY_AUDIO_MESSAGE = "Audio data is empty. Please try again with a valid audio file."


def _to_mono_float32(audio_data):
    """Return *audio_data* as a 1-D float32 waveform.

    Integer PCM samples are scaled into [-1.0, 1.0) using the full range of
    their dtype; floating-point input is only cast. A multi-dimensional array
    is treated as (samples, channels) and averaged down to one channel.

    Raises:
        TypeError, ValueError: if the data cannot be converted to float32.
    """
    samples = np.asarray(audio_data)
    if samples.size == 0:
        # Nothing to scale or downmix; hand back a canonical empty waveform.
        return np.zeros(0, dtype=np.float32)

    if np.issubdtype(samples.dtype, np.integer):
        # e.g. int16 -> divide by 32768 so the result lies in [-1, 1).
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # Downmix: average across the channel axis.
        samples = samples.mean(axis=1)
    return samples


def process_audio(audio):
    """Run the Shuka model on one audio clip and return its text reply.

    Args:
        audio: ``None`` or a ``(sample_rate, samples)`` pair as produced by a
            Gradio ``Audio`` component configured with ``type="numpy"``.

    Returns:
        The generated text, or a human-readable error message when the input
        is missing, empty or malformed, or when resampling or the model call
        fails. Errors are returned rather than raised so the UI can show them.
    """
    if audio is None:
        return "No audio provided. Please upload or record an audio file."

    try:
        sample_rate, audio_data = audio
    except (TypeError, ValueError) as e:
        # Not a 2-item iterable.
        return f"Error processing audio input: {e}"

    if audio_data is None:
        return _EMPTY_AUDIO_MESSAGE

    # The resampler only accepts floating-point input and works along the last
    # axis, so integer PCM and (samples, channels) arrays must be normalised
    # to a mono float waveform before anything else touches them.
    try:
        waveform = _to_mono_float32(audio_data)
    except (TypeError, ValueError) as e:
        return f"Error processing audio input: {e}"

    if waveform.size == 0:
        return _EMPTY_AUDIO_MESSAGE

    # Resample to the target rate if necessary.
    if sample_rate != _TARGET_SAMPLE_RATE:
        try:
            waveform = librosa.resample(
                waveform, orig_sr=sample_rate, target_sr=_TARGET_SAMPLE_RATE
            )
            sample_rate = _TARGET_SAMPLE_RATE
        except Exception as e:
            # Boundary with third-party code: report any failure to the user.
            return f"Error during resampling: {e}"

    # Conversation turns: a system instruction plus a user turn whose content
    # is the audio placeholder token.
    turns = [
        {'role': 'system', 'content': 'Respond naturally and informatively.'},
        {'role': 'user', 'content': '<|audio|>'}
    ]

    try:
        result = pipe(
            {'audio': waveform, 'turns': turns, 'sampling_rate': sample_rate},
            max_new_tokens=512,
        )
    except Exception as e:
        # Boundary with the model: surface the failure instead of crashing the UI.
        return f"Error during model processing: {e}"

    # A non-empty list of dicts carries the reply under 'generated_text';
    # any other result shape is shown as its string form.
    if isinstance(result, list) and result and isinstance(result[0], dict):
        return result[0].get('generated_text', '')
    return str(result)

# Assemble the web UI: one audio input wired to process_audio and one
# plain-text output.
# type="numpy" makes the component hand process_audio a
# (sample_rate, numpy_array) tuple.
# To record audio directly you may need a Gradio version whose Audio
# component supports the "source" option.
_audio_input = gr.Audio(type="numpy")
iface = gr.Interface(
    fn=process_audio,
    inputs=_audio_input,
    outputs="text",
    title="Sarvam AI Shuka Voice Demo",
    description="Upload an audio file and get a response using Sarvam AI's Shuka model.",
)

if __name__ == "__main__":
    # Start the server only when run as a script. Port 7861 is used in case
    # 7860 is already taken; change it here if another port is needed.
    launch_port = 7861
    iface.launch(server_port=launch_port)