JASRv1.1

Running on Zero

File size: 2,503 Bytes

8e73cee
 
 
 
 
f9a373a
8e73cee
 
 
 
 
 
 
 
f9a373a
8e73cee
f9a373a
8e73cee
 
 
 
 
 
 
 
 
3acdbb1
 
 
 
 
 
 
 
 
 
 
 
 
 
8e73cee
11ad5cc
 
 
3121b1f
4b9672f
11ad5cc
 
 
 
 
 
 
 
3b558b0
 
4b9672f
2802456
11ad5cc
 
 
 
 
 
 
 
 
 
8e73cee

import gradio as gr
from transformers import pipeline
import numpy as np
import os
from huggingface_hub import login
import spaces

# Authenticate against the Hub only when a token is present in the
# environment (e.g. a Space secret); otherwise no login is attempted.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

# Checkpoint served by this demo. Change this to match your repo!
MODEL_ID = "badrex/JASR"

# Built once at import time and reused by every transcription request.
transcriber = pipeline(task="automatic-speech-recognition", model=MODEL_ID)

@spaces.GPU
def transcribe(audio):
    """Transcribe one audio clip to text with the module-level pipeline.

    Args:
        audio: A ``(sampling_rate, samples)`` pair, where ``samples`` is a
            NumPy array. ``None`` is treated as "no audio provided"
            (presumably what the audio input yields when the user submits
            nothing; confirm against the Gradio version in use).

    Returns:
        The ``"text"`` field of the pipeline's output.

    Raises:
        gr.Error: If no audio was provided or the clip contains no samples.
    """
    if audio is None:
        raise gr.Error("Please upload or record an audio clip first.")

    sr, y = audio
    if y.size == 0:
        # np.max() raises on an empty array; report it as a user error instead.
        raise gr.Error("The audio clip is empty. Please try another recording.")

    # Convert to mono if stereo (averages over axis 1, i.e. assumes a
    # (samples, channels) layout).
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)

    # Peak-normalise, but skip the division for an all-silent clip:
    # dividing by a zero peak would fill the signal with NaNs.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    return transcriber({"sampling_rate": sr, "raw": y})["text"]


# Manually prepare example file paths without metadata: one single-element
# row per audio file found in the examples directory.
examples = []
examples_dir = "examples"
if os.path.isdir(examples_dir):
    # os.listdir() returns entries in arbitrary order, so sort for a stable
    # example list; lower-case the name so ".WAV"/".MP3" files are not missed.
    examples = [
        [os.path.join(examples_dir, filename)]
        for filename in sorted(os.listdir(examples_dir))
        if filename.lower().endswith((".wav", ".mp3", ".ogg"))
    ]
    print(f"Found {len(examples)} example files")
else:
    print("Examples directory not found")

    
# Single-function Gradio UI: one audio input, one text output, with an HTML
# description shown under the title.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(),
    outputs="text",
    theme="huggingface",
    title="JASR 🐐 Dialectal Arabic Speech Recognition",
    description="""
        <div class="centered-content">
            <div>
                <p>
                By <a href="https://badrex.github.io/" style="color: #2563eb;">Badr al-Absi</a> with ❤️🤍💚
                </p>
                <br>
                <p style="font-size: 15px; line-height: 1.8;">
                Marhaba 👋🏼
                <br>
                <br>
                 This is a demo for JASR, pronounced <i>Jasir</i>, an automatic speech recognition system optimized for the regional dialects of <i>Jazirat al-Arab</i>, or the Arabian Peninsula. The model is a fine-tune of the speech foundation model <a href="https://huggingface.co/facebook/w2v-bert-2.0" style="color: #FF5349;">w2v-BERT 2.0</a>, a 580M pre-trained speech encoder.
                <br>
                </p>
                <p style="font-size: 15px; line-height: 1.8;">
                Simply <strong>upload an audio file</strong> 📤 or <strong>record yourself speaking</strong> 🎙️⏺️ to try out the model!
                </p>
            </div>
        </div>
        """,
    # Pass None rather than an empty list when no example files were found.
    examples=examples or None,
    cache_examples=False,  # Disable caching to avoid issues
    # NOTE(review): None appears to defer to Gradio's default flagging
    # behaviour rather than disabling it; use "never" if the Flag button
    # should be hidden. Confirm intent.
    flagging_mode=None,
)

# Start the Gradio server only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()