sagar007 committed
Commit 3bb5f02 · verified · 1 Parent(s): 5c15933

Update app.py

Files changed (1): app.py (+30, -31)
app.py CHANGED
@@ -1,43 +1,42 @@
-import gradio as gr
 import transformers
 import librosa
 import torch
+from tqdm import tqdm
 
 # Load the model pipeline
 pipe = transformers.pipeline(model='sarvamai/shuka_v1', trust_remote_code=True, device=0 if torch.cuda.is_available() else -1, torch_dtype=torch.bfloat16)
 
-def process_audio(audio_file, system_prompt, user_prompt):
-    # Load and preprocess the audio
-    audio, sr = librosa.load(audio_file.name, sr=16000)
-
-    # Prepare the conversation turns
-    turns = [
-        {'role': 'system', 'content': system_prompt},
-        {'role': 'user', 'content': f'<|audio|>{user_prompt}'}
-    ]
-
-    # Generate response
-    result = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
-
-    return result[0]['generated_text']
-
-# Create the Gradio interface
-iface = gr.Interface(
-    fn=process_audio,
-    inputs=[
-        gr.Audio(type="filepath", label="Upload Audio (Indic language)"),
-        gr.Textbox(label="System Prompt", value="Respond naturally and informatively."),
-        gr.Textbox(label="User Prompt (optional)", value="")
-    ],
-    outputs=gr.Textbox(label="Shuka v1 Response"),
-    title="Shuka v1 Demo: Multilingual Audio Understanding",
-    description="Upload an audio file in any Indic language, and Shuka v1 will process and respond to it.",
-    examples=[
-        ["path/to/hindi_sample.wav", "Respond naturally and informatively.", "What is the main topic of this audio?"],
-        ["path/to/tamil_sample.wav", "Translate the audio content to English.", ""],
-        ["path/to/bengali_sample.wav", "Summarize the key points discussed in the audio.", ""]
-    ]
-)
-
-# Launch the app
-iface.launch()
+def process_audio_batched(audio_file, system_prompt, user_prompt, batch_size=10, segment_length=10):  # segment_length in seconds
+    # Load audio
+    audio, sr = librosa.load(audio_file, sr=16000)
+
+    # Calculate number of samples per segment
+    samples_per_segment = segment_length * sr
+
+    # Split audio into segments
+    segments = [audio[i:i+samples_per_segment] for i in range(0, len(audio), samples_per_segment)]
+
+    results = []
+
+    # Process segments in batches
+    for i in tqdm(range(0, len(segments), batch_size)):
+        batch = segments[i:i+batch_size]
+
+        turns = [
+            {'role': 'system', 'content': system_prompt},
+            {'role': 'user', 'content': f'<|audio|>{user_prompt}'}
+        ]
+
+        batch_results = pipe([{'audio': seg, 'turns': turns, 'sampling_rate': sr} for seg in batch], max_new_tokens=512)
+        results.extend([result[0]['generated_text'] for result in batch_results])
+
+    # Combine results (this is a simple concatenation, you might want to implement a more sophisticated method)
+    return ' '.join(results)
+
+# Example usage
+audio_file = "path/to/your/audio/file.wav"
+system_prompt = "Transcribe the audio accurately."
+user_prompt = "What is being said in this audio?"
+
+full_result = process_audio_batched(audio_file, system_prompt, user_prompt)
+print(full_result)
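
A quick sanity check of the segmentation arithmetic in process_audio_batched, as a standalone sketch; the 95-second clip length below is a made-up figure for illustration, not something from this commit:

import math

sr = 16000                                   # librosa.load(..., sr=16000) resamples to this rate
segment_length = 10                          # seconds, the function's default
samples_per_segment = segment_length * sr    # 160,000 samples per segment

duration_s = 95                              # hypothetical clip length
total_samples = duration_s * sr              # 1,520,000 samples
num_segments = math.ceil(total_samples / samples_per_segment)
print(num_segments)                          # 10 segments; with batch_size=10 that is a single pipeline call

The list comprehension in the commit produces the same count: range(0, len(audio), samples_per_segment) steps through the array 160,000 samples at a time, and the final slice is simply shorter than the rest.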
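
The commit also removes the Gradio interface in favor of the script-style example above. If the UI were to be restored on top of the batched function, a minimal sketch could look like the following (untested; the widget definitions are carried over from the deleted version, and batch_size/segment_length keep their defaults):

import gradio as gr

iface = gr.Interface(
    fn=process_audio_batched,  # swap in the new batched function for the old process_audio
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio (Indic language)"),
        gr.Textbox(label="System Prompt", value="Respond naturally and informatively."),
        gr.Textbox(label="User Prompt (optional)", value="")
    ],
    outputs=gr.Textbox(label="Shuka v1 Response"),
    title="Shuka v1 Demo: Multilingual Audio Understanding",
    description="Upload an audio file in any Indic language, and Shuka v1 will process and respond to it."
)
iface.launch()

Note that the new code calls librosa.load(audio_file, sr=16000) on the value directly, so the plain filepath string produced by gr.Audio(type="filepath") works here, whereas the old audio_file.name access assumed a file-like object.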