import opensmile
import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler

import onnxruntime
import torch

import gradio as gr


model_names = ["DNN", "RandomForest"]

rf_model_path = "RF_emobase_20_model_top1_score0.6863_20231207_1537.joblib"
dnn_model_path = "NN_emobase_allfeature_model_score_69.00_20240304_1432.onnx"

dnn_model = onnxruntime.InferenceSession(dnn_model_path)
rf_model = joblib.load(rf_model_path)
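
# Sanity-check sketch: predict_dnn below assumes a single float input named
# "model_input"; uncomment to inspect what the exported graph actually expects.
# for inp in dnn_model.get_inputs():
#     print(inp.name, inp.type, inp.shape)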

def extract_features_rf(audio_path):
    # Extract emobase functionals with openSMILE.
    smile = opensmile.Smile(
        # feature_set=opensmile.FeatureSet.GeMAPSv01b,
        feature_set=opensmile.FeatureSet.emobase,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    feature_df = smile.process_files(audio_path)
    # The 20 features the Random Forest was trained on.
    output_features = ['F0env_sma_de_amean', 'lspFreq_sma_de[5]_linregc1', 'mfcc_sma[3]_linregc1', 'lspFreq_sma[6]_quartile1', 'lspFreq_sma_de[6]_linregerrQ', 'lspFreq_sma_de[6]_maxPos', 'lspFreq_sma_de[6]_iqr2-3', 'lspFreq_sma_de[7]_minPos', 'lspFreq_sma_de[4]_linregc1', 'lspFreq_sma_de[6]_linregerrA', 'lspFreq_sma_de[6]_linregc2', 'lspFreq_sma[5]_amean', 'lspFreq_sma_de[6]_iqr1-2', 'mfcc_sma[1]_minPos', 'mfcc_sma[4]_linregc1', 'mfcc_sma[9]_iqr2-3', 'lspFreq_sma[5]_kurtosis', 'lspFreq_sma_de[3]_skewness', 'mfcc_sma[3]_minPos', 'mfcc_sma[12]_linregc1']
    df = pd.DataFrame(feature_df.values[0], index=feature_df.columns)
    df = df[df.index.isin(output_features)]
    df = df.T
    scaler = StandardScaler()
    feature = scaler.fit_transform(df)
    print(df.shape)

    return feature
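
# NOTE: a StandardScaler fitted on a single sample zero-centres every column
# (sklearn maps zero-variance features to scale 1, so each value becomes 0).
# A more faithful setup would persist the scaler fitted at training time and
# reuse it here; a minimal sketch, assuming a hypothetical saved artifact:
#
#     scaler = joblib.load("emobase_scaler.joblib")  # fitted on training data
#     feature = scaler.transform(df)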

def predict_rf(audio_path):
    # Extract features with openSMILE.
    feature_vector = extract_features_rf([audio_path])

    # Run inference with the loaded Random Forest.
    prediction = rf_model.predict(feature_vector)
    # print(f"Prediction: {prediction}")
    return prediction[0]  # predict() returns an array; unwrap the single label


def extract_features_dnn(audio_path):
    # Extract the full emobase functional feature set with openSMILE.
    smile = opensmile.Smile(
        # feature_set=opensmile.FeatureSet.GeMAPSv01b,
        feature_set=opensmile.FeatureSet.emobase,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    feature_df = smile.process_files(audio_path)
    df = pd.DataFrame(feature_df.values[0], index=feature_df.columns)
    df = df.T
    scaler = StandardScaler()
    feature = scaler.fit_transform(df)
    print(df.shape)

    return feature
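
# Note: unlike the Random Forest path, the DNN consumes the full emobase
# functional set (988 features per clip), matching the "allfeature" tag in
# the model filename.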

def softmax_calc_(pred):
    # Despite the name, this is an argmax over the model outputs:
    # class index 0 -> "question", anything else -> "declarative".
    if torch.argmax(pred) == torch.tensor(0):
        prediction = "question"
    else:
        prediction = "declarative"
    return prediction
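
# Optional sketch: assuming the ONNX output holds raw logits of shape (1, 2),
# class probabilities could be exposed alongside the hard label (this helper
# is illustrative and not wired into the app):
def softmax_probs(logits):
    """Return [P(question), P(declarative)] from raw logits."""
    probs = torch.softmax(torch.FloatTensor(logits), dim=-1)
    return probs.squeeze().tolist()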

def predict_dnn(audio_path):
    # Extract features with openSMILE.
    feature_vector = extract_features_dnn([audio_path])

    # Run inference with the loaded ONNX model; ONNX Runtime expects float32
    # for float tensor inputs, while sklearn emits float64.
    onnx_outs = dnn_model.run(None, {"model_input": feature_vector.astype("float32")})
    print(onnx_outs)
    prediction = softmax_calc_(torch.FloatTensor(onnx_outs[0]))
    print(f"Prediction: {prediction}")
    return prediction

def main(model, audio_path):
    # Dispatch to whichever model was selected in the dropdown.
    if model == "DNN":
        return predict_dnn(audio_path)
    elif model == "RandomForest":
        return predict_rf(audio_path)


demo = gr.Interface(
    fn=main,
    inputs=[
        gr.Dropdown(choices=model_names, value=model_names[0], label="Model"),
        gr.Audio(sources=["microphone", "upload"], type="filepath"),
    ],
    outputs=["textbox"],
    live=True,
    description="Demo for an audio-to-question classifier",
)

demo.launch()
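
# Usage note: running this script starts a local Gradio server
# (http://127.0.0.1:7860 by default); a hosted Space launches it automatically.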