Update app.py
app.py CHANGED
@@ -1,43 +1,42 @@
-import gradio as gr
 import transformers
 import librosa
 import torch
+from tqdm import tqdm
 
 # Load the model pipeline
 pipe = transformers.pipeline(model='sarvamai/shuka_v1', trust_remote_code=True, device=0 if torch.cuda.is_available() else -1, torch_dtype=torch.bfloat16)
 
-def …
-    # Load …
-    audio, sr = librosa.load(audio_file…
-
-    # …
-    turns = [
-        {'role': 'system', 'content': system_prompt},
-        {'role': 'user', 'content': f'<|audio|>{user_prompt}'}
-    ]
-
-    # …
-    …
-
-    …
-
-# …
-…
-…
-    inputs=[
-        gr.Audio(type="filepath", label="Upload Audio (Indic language)"),
-        gr.Textbox(label="System Prompt", value="Respond naturally and informatively."),
-        gr.Textbox(label="User Prompt (optional)", value="")
-    ],
-    outputs=gr.Textbox(label="Shuka v1 Response"),
-    title="Shuka v1 Demo: Multilingual Audio Understanding",
-    description="Upload an audio file in any Indic language, and Shuka v1 will process and respond to it.",
-    examples=[
-        ["path/to/hindi_sample.wav", "Respond naturally and informatively.", "What is the main topic of this audio?"],
-        ["path/to/tamil_sample.wav", "Translate the audio content to English.", ""],
-        ["path/to/bengali_sample.wav", "Summarize the key points discussed in the audio.", ""]
-    ]
-)
-
-…
-…
+def process_audio_batched(audio_file, system_prompt, user_prompt, batch_size=10, segment_length=10):  # segment_length in seconds
+    # Load audio
+    audio, sr = librosa.load(audio_file, sr=16000)
+
+    # Calculate number of samples per segment
+    samples_per_segment = segment_length * sr
+
+    # Split audio into segments
+    segments = [audio[i:i+samples_per_segment] for i in range(0, len(audio), samples_per_segment)]
+
+    results = []
+
+    # Process segments in batches
+    for i in tqdm(range(0, len(segments), batch_size)):
+        batch = segments[i:i+batch_size]
+
+        turns = [
+            {'role': 'system', 'content': system_prompt},
+            {'role': 'user', 'content': f'<|audio|>{user_prompt}'}
+        ]
+
+        batch_results = pipe([{'audio': seg, 'turns': turns, 'sampling_rate': sr} for seg in batch], max_new_tokens=512)
+        results.extend([result[0]['generated_text'] for result in batch_results])
+
+    # Combine results (this is a simple concatenation, you might want to implement a more sophisticated method)
+    return ' '.join(results)
+
+# Example usage
+audio_file = "path/to/your/audio/file.wav"
+system_prompt = "Transcribe the audio accurately."
+user_prompt = "What is being said in this audio?"
+
+full_result = process_audio_batched(audio_file, system_prompt, user_prompt)
+print(full_result)
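The "# Combine results" comment above flags the plain ' '.join as a placeholder. One possible refinement, assuming the fixed-length segments used here: prefix each chunk's text with its start offset so segment boundaries stay visible. combine_with_timestamps is a hypothetical helper sketched for illustration, not part of the commit:

def combine_with_timestamps(results, segment_length=10):
    # Hypothetical helper (not in the commit): tag each segment's output with
    # its start time in seconds instead of flattening everything with ' '.join.
    lines = []
    for idx, text in enumerate(results):
        start = idx * segment_length  # valid because all segments share one length
        lines.append(f"[{start}s] {text.strip()}")
    return "\n".join(lines)

process_audio_batched could then end with return combine_with_timestamps(results, segment_length) in place of return ' '.join(results).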
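Note that this change also deletes the gr.Interface from the old version, so app.py now runs a one-shot script with a hard-coded audio path and prints to stdout rather than serving the Space's UI. If the demo page is still wanted, the batched function can be wrapped in the interface the commit removed. A minimal sketch reusing the deleted labels; the demo variable name is illustrative, not from the commit:

import gradio as gr

# Sketch: re-wrap the new batched function in the UI this commit removed.
# batch_size and segment_length keep their defaults, so three inputs suffice.
demo = gr.Interface(
    fn=process_audio_batched,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio (Indic language)"),
        gr.Textbox(label="System Prompt", value="Respond naturally and informatively."),
        gr.Textbox(label="User Prompt (optional)", value=""),
    ],
    outputs=gr.Textbox(label="Shuka v1 Response"),
    title="Shuka v1 Demo: Multilingual Audio Understanding",
    description="Upload an audio file in any Indic language, and Shuka v1 will process and respond to it.",
)

demo.launch()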