File size: 4,963 Bytes
6ec981e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import gradio as gr
import os
from groq import Groq
import tempfile

def validate_file(file):
    """Validate an uploaded file's size and extension.

    Parameters:
        file: The upload to check. May be an object exposing the on-disk
              path as ``.name`` (the shape ``transcribe_audio`` passes in),
              a plain path string, or an ``os.PathLike``. ``None`` means
              nothing was uploaded.

    Returns:
        A ``(is_valid, message)`` tuple. ``is_valid`` is ``True`` only when
        the file can be stat'ed, is at most 25MB, and has one of the
        supported extensions (compared case-insensitively). ``message``
        explains the outcome in either case.
    """
    if file is None:
        return False, "No file uploaded"

    max_size_mb = 25
    valid_extensions = (
        '.mp3', '.mp4', '.mpeg', '.mpga', '.m4a',
        '.wav', '.webm', '.flac', '.ogg', '.aac',
    )

    # pathlib paths also have a `.name`, but it is only the basename, so
    # resolve real path objects first; otherwise prefer `.name` and fall
    # back to the value itself (a bare path string).
    if isinstance(file, os.PathLike):
        path = os.fspath(file)
    else:
        path = getattr(file, "name", file)

    # Size check comes first, so an oversized file with a bad extension
    # reports the size problem.
    try:
        file_size_mb = os.path.getsize(path) / (1024 * 1024)
    except OSError as exc:
        # A missing/unreadable file is a validation failure, not a crash.
        return False, f"Could not read uploaded file: {exc}"

    if file_size_mb > max_size_mb:
        return False, f"File size ({file_size_mb:.1f}MB) exceeds {max_size_mb}MB limit"

    file_extension = os.path.splitext(path)[1].lower()
    if file_extension not in valid_extensions:
        return False, f"Invalid file type. Supported formats: {', '.join(valid_extensions)}"

    return True, "File is valid"

def transcribe_audio(audio_file, api_key):
    """Transcribe audio/video files into text using Groq's Whisper model.
    
    This tool converts spoken content from audio and video files into written text.
    It supports multiple audio formats and handles files up to 25MB in size.
    
    Parameters:
        audio_file: An audio or video file to transcribe. 
                   Supported formats: MP3, MP4, MPEG, MPGA, M4A, WAV, WebM, FLAC, OGG, AAC.
                   Maximum size: 25MB.
        api_key: Your Groq API key, required for authentication.
                You can obtain this from https://console.groq.com/
    
    Returns:
        A text transcript of the spoken content in the audio file.
        If anything goes wrong, a message starting with "Error: " is returned instead.
        
    Example:
        Upload a podcast episode to get a complete text transcript.
    """
    # NOTE(review): the app is launched with mcp_server=True, so this docstring
    # presumably doubles as the tool description; keep its section layout stable.
    try:
        # Pasted keys often carry stray spaces/newlines; a blank or
        # whitespace-only key is treated the same as no key at all.
        api_key = (api_key or "").strip()
        if not api_key:
            return "Error: Please provide your Groq API key"

        if audio_file is None:
            return "Error: Please upload an audio or video file"

        # Reject oversized / unsupported uploads before spending an API call.
        is_valid, message = validate_file(audio_file)
        if not is_valid:
            return f"Error: {message}"

        client = Groq(api_key=api_key)

        # The request carries a (filename, raw bytes) pair; the whole file
        # is read into memory, which the 25MB cap keeps bounded.
        with open(audio_file.name, "rb") as file:
            transcription = client.audio.transcriptions.create(
                file=(os.path.basename(audio_file.name), file.read()),
                model="whisper-large-v3-turbo"
            )

        return transcription.text

    except Exception as e:
        # UI boundary: surface the failure as text in the output box
        # rather than letting the callback raise.
        return f"Error: {str(e)}"

# Build the Gradio UI: a two-column layout with upload/settings on the left
# and the transcript on the right, followed by a links footer.
with gr.Blocks(title="Audio/Video Transcription with Groq", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎡 Audio/Video Transcription with Groq Whisper")
    gr.Markdown("Upload an audio or video file and get an AI-generated transcript using Groq's Whisper model.")

    with gr.Row():
        # Left column - Input controls
        with gr.Column(scale=1):
            gr.Markdown("### 📤 Upload & Settings")

            # The extension list mirrors the one enforced in validate_file;
            # keep the two in sync when adding formats.
            audio_input = gr.File(
                label="Upload Audio/Video File",
                file_types=[".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".flac", ".ogg", ".aac"],
                file_count="single"
            )

            # The key is supplied per request by the user, not read from
            # the environment.
            api_key_input = gr.Textbox(
                label="Groq API Key",
                placeholder="Enter your Groq API key here...",
                type="password",
                lines=1
            )

            transcribe_btn = gr.Button(
                "🎯 Transcribe Audio",
                variant="primary",
                size="lg"
            )

            gr.Markdown("### ℹ️ File Requirements")
            gr.Markdown("""
            - **Max file size**: 25MB
            - **Supported formats**: MP3, MP4, MPEG, MPGA, M4A, WAV, WebM, FLAC, OGG, AAC
            - **Get API key**: [Groq Console](https://console.groq.com/)
            """)

        # Right column - Output
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Transcript")

            # Also displays the "Error: ..." strings returned by
            # transcribe_audio.
            transcript_output = gr.Textbox(
                label="Generated Transcript",
                placeholder="Your transcript will appear here...",
                lines=20,
                max_lines=30,
                show_copy_button=True,
                interactive=False
            )

    # Connect the button to the transcription function
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, api_key_input],
        outputs=transcript_output,
        show_progress=True
    )

    # Footer with reference links
    gr.Markdown("### 🔗 Useful Links")
    gr.Markdown("""
    - [Get your Groq API key](https://console.groq.com/)
    - [Groq Documentation](https://console.groq.com/docs)
    - [Supported audio formats](https://platform.openai.com/docs/guides/speech-to-text)
    """)

if __name__ == "__main__":
    # mcp_server=True asks Gradio to expose the app's functions as MCP tools
    # in addition to serving the web UI.
    demo.launch(mcp_server=True)