sagar007 committed
Commit 416ddd3 · verified · 1 Parent(s): 68ce9a1

Update app.py

Files changed (1)
  1. app.py +40 -44
app.py CHANGED
@@ -1,61 +1,57 @@
+import gradio as gr
 import librosa
 import torch
-from tqdm import tqdm
-import transformers
 
 # Check for GPU availability
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+from transformers import pipeline
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 
 # Load the model pipeline
-pipe = transformers.pipeline(
-    model='sarvamai/shuka_v1',
-    trust_remote_code=True,
-    device=device,
-    torch_dtype=torch.float16 if device.type == 'cuda' else torch.float32
-)
+try:
+    pipe = pipeline(
+        model='sarvamai/shuka_v1',
+        trust_remote_code=True,
+        device=device,
+        torch_dtype=torch.float16 if device == 'cuda' else torch.float32
+    )
+    print("Pipeline loaded successfully")
+except Exception as e:
+    print(f"Error loading pipeline: {str(e)}")
+    pipe = None
+
+def process_audio(audio_file, system_prompt, user_prompt):
+    if pipe is None:
+        return "Error: Model pipeline not initialized."
 
-def process_audio_batched(audio_file, system_prompt, user_prompt, batch_size=4, segment_length=10):
-    # Load audio
-    audio, sr = librosa.load(audio_file, sr=16000)
-
-    # Calculate number of samples per segment
-    samples_per_segment = segment_length * sr
-
-    # Split audio into segments
-    segments = [audio[i:i+samples_per_segment] for i in range(0, len(audio), samples_per_segment)]
-
-    full_result = []
-
-    # Process segments in batches
-    for i in tqdm(range(0, len(segments), batch_size)):
-        batch = segments[i:i+batch_size]
+    try:
+        # Load audio
+        audio, sr = librosa.load(audio_file, sr=16000)
 
         turns = [
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': f'<|audio|>{user_prompt}'}
        ]
 
-        # Move batch to GPU if available
-        batch_gpu = [torch.tensor(seg, device=device) for seg in batch]
+        # Process audio
+        result = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
 
-        batch_results = pipe([{'audio': seg, 'turns': turns, 'sampling_rate': sr} for seg in batch_gpu], max_new_tokens=512)
-        full_result.extend([result[0]['generated_text'] for result in batch_results])
-
-        # Clear GPU memory
-        torch.cuda.empty_cache()
-
-    # Combine results
-    return ' '.join(full_result)
+        return result[0]['generated_text']
+    except Exception as e:
+        return f"Error processing audio: {str(e)}"
 
-# Example usage
-audio_file = "path/to/your/audio/file.wav"
-system_prompt = "Transcribe the audio accurately."
-user_prompt = "What is being said in this audio?"
+# Define Gradio interface
+iface = gr.Interface(
+    fn=process_audio,
+    inputs=[
+        gr.Audio(type="filepath", label="Upload Audio"),
+        gr.Textbox(label="System Prompt", value="Transcribe the audio accurately."),
+        gr.Textbox(label="User Prompt", value="What is being said in this audio?")
+    ],
+    outputs="text",
+    title="Audio Processing with Shuka_v1",
+    description="Upload an audio file and get the transcription or analysis based on your prompts."
+)
 
-try:
-    full_result = process_audio_batched(audio_file, system_prompt, user_prompt)
-    print(full_result)
-except Exception as e:
-    print(f"An error occurred: {str(e)}")
-    # Additional error handling and logging can be added here
+# Launch the app
+iface.launch()
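
For a quick smoke test of the new handler outside the Gradio UI, process_audio can be called directly, for example from an interactive session with iface.launch() skipped. A minimal sketch; "sample.wav" is a hypothetical placeholder path, not a file shipped with this Space:

# Hypothetical smoke test: call the handler directly, bypassing the UI.
# Run with iface.launch() skipped; "sample.wav" is a placeholder path.
result_text = process_audio(
    "sample.wav",
    "Transcribe the audio accurately.",
    "What is being said in this audio?",
)
print(result_text)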