File size: 2,608 Bytes
8f0ca83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# app.py
import gradio as gr
import pandas as pd
import numpy as np
import librosa
import joblib
import tensorflow as tf
from keras.models import load_model
from transformers import AutoTokenizer, TFAutoModel

# ====================
# 1. Load Model and Assets
# ====================
# Keras model plus the scaler and label encoder, all loaded from the working directory.
model = load_model("raga_predictor_model.h5")
scaler = joblib.load("scaler.pkl")
encoder = joblib.load("label_encoder.pkl")

# Tokenizer and BERT weights come from the same Hugging Face checkpoint.
BERT_CHECKPOINT = "ai4bharat/IndicBERTv2-MLM-only"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = TFAutoModel.from_pretrained(BERT_CHECKPOINT)

# Lookup from the CSV's 'raga' column to its 'description' column.
meta = pd.read_csv("raga_metadata.csv")
raga_descriptions = {
    raga: text for raga, text in zip(meta['raga'], meta['description'])
}

# ====================
# 2. Define Utility Functions
# ====================
def extract_features(file_path):
    """Summarise an audio file as a one-row DataFrame of averaged features.

    The file is loaded with librosa at a 22050 Hz sample rate. The returned
    row contains, in order: ``chroma_stft`` (mean of the chroma STFT),
    ``spec_cent`` (mean of the spectral centroid) and ``mfcc1`` .. ``mfcc18``
    (mean of each of the 18 MFCC rows).

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.

    Returns:
        A ``pandas.DataFrame`` with a single row and one column per feature.
    """
    signal, rate = librosa.load(file_path, sr=22050)

    chroma = librosa.feature.chroma_stft(y=signal, sr=rate)
    centroid = librosa.feature.spectral_centroid(y=signal, sr=rate)
    row = {"chroma_stft": np.mean(chroma), "spec_cent": np.mean(centroid)}

    # One averaged value per MFCC coefficient, with column names starting at 1.
    mfcc_matrix = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=18)
    row.update(
        (f"mfcc{position}", np.mean(coefficients))
        for position, coefficients in enumerate(mfcc_matrix, start=1)
    )

    return pd.DataFrame([row])

def tokenize_description(description_text):
    """Embed description text using the module-level tokenizer and BERT model.

    The text is tokenized with padding and truncation to at most 64 tokens,
    passed through ``bert_model``, and the slice ``[:, 0, :]`` of the model's
    first output is returned (presumably the first-token embedding for each
    input text; confirm against the model's output layout).

    Args:
        description_text: Text, or list of texts, passed to the tokenizer.

    Returns:
        The sliced TensorFlow tensor described above.
    """
    encoded = tokenizer(
        description_text,
        padding=True,
        truncation=True,
        max_length=64,
        return_tensors="tf",
    )
    model_outputs = bert_model(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
    )
    first_output = model_outputs[0]
    return first_output[:, 0, :]

def predict_raga(audio_file):
    """Predict the raga of an uploaded audio clip and format the result.

    Args:
        audio_file: The value received from the audio input. Either an
            object exposing the on-disk path as ``.name``, a plain path
            string, or ``None`` when no clip was supplied.

    Returns:
        A display string with the predicted raga label and its description
        taken from ``raga_descriptions`` (or a fallback text when the label
        has no entry). When ``audio_file`` is ``None`` a short prompt asking
        for an upload is returned instead.
    """
    # NOTE(review): Gradio is expected to pass None when the form is submitted
    # without a clip; answer with a prompt instead of failing on `.name`.
    if audio_file is None:
        return "Please upload an audio file."

    # Accept both a path string and a file object carrying its path in `.name`.
    audio_path = audio_file if isinstance(audio_file, str) else audio_file.name

    # Extract features and scale them with the scaler loaded at module level.
    audio_df = extract_features(audio_path)
    audio_scaled = scaler.transform(audio_df)
    # Reshape the scaled row to (1, 1, n_features) for the model's first input.
    audio_lstm_input = audio_scaled.reshape((1, 1, audio_scaled.shape[1]))

    # No text is collected from the user, so an empty description is embedded
    # to fill the model's second input.
    description_text = ""
    desc_embed = tokenize_description([description_text])

    # Predict and map the highest-scoring index back to its raga label.
    pred = model.predict([audio_lstm_input, desc_embed])
    raga_pred = encoder.inverse_transform([np.argmax(pred)])[0]

    # Get description
    description = raga_descriptions.get(raga_pred, "No description available.")

    return f"🎵 Predicted Raga: {raga_pred}\n\n📝 Description:\n{description}"

# ====================
# 3. Gradio Interface
# ====================
title = "🎶 Raga Prediction App"
description = "Upload an Indian classical music clip, and I will predict the Raga for you!"

# NOTE(review): newer Gradio releases appear to accept only "numpy" or
# "filepath" for `type`; confirm against the installed Gradio version.
audio_input = gr.Audio(type="file", label="Upload Audio File")

# Single audio input mapped to a plain-text prediction.
interface = gr.Interface(
    fn=predict_raga,
    inputs=audio_input,
    outputs="text",
    title=title,
    description=description,
)

# Start the web UI as soon as the module is executed.
interface.launch()