File size: 8,830 Bytes
960b1a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92da7ef
 
960b1a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)
from torch.nn.functional import silu
from torch.nn.functional import softplus
from einops import rearrange, einsum
from torch import Tensor
from einops import rearrange

# DEVICE = torch.device('cuda')
DEVICE = torch.device('cpu')

## Audio models

class CustomMambaBlock(nn.Module):
    def __init__(self, d_input, d_model, dropout=0.1):
        super().__init__()
        self.in_proj = nn.Linear(d_input, d_model)
        self.s_B = nn.Linear(d_model, d_model)
        self.s_C = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_input)
        self.norm = nn.LayerNorm(d_input)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.ReLU()

    def forward(self, x):
        x_in = x  # сохраняем вход
        x = self.in_proj(x)
        B = self.s_B(x)
        C = self.s_C(x)
        x = x + B + C
        x = self.activation(x)
        x = self.out_proj(x)
        x = self.dropout(x)
        x = self.norm(x + x_in)  # residual + norm
        return x
    
class CustomMambaClassifier(nn.Module):
    def __init__(self, input_size=1024, d_model=256, num_layers=2, num_classes=7, dropout=0.1):
        super().__init__()
        self.input_proj = nn.Linear(input_size, d_model)
        self.blocks = nn.ModuleList([
            CustomMambaBlock(d_model, d_model, dropout=dropout)
            for _ in range(num_layers)
        ])
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x, lengths, with_features=False):
        # x: (batch, seq_length, input_size)
        x = self.input_proj(x)
        for block in self.blocks:
            x = block(x)
        pooled = []
        for i, l in enumerate(lengths):
            if l > 0:
                pooled.append(x[i, :l, :].mean(dim=0))
            else:
                pooled.append(torch.zeros(x.size(2), device=x.device))
        pooled = torch.stack(pooled, dim=0)
        if with_features:
            return self.fc(pooled), x
        else:
            return self.fc(pooled)

def get_model_mamba(params):
    return CustomMambaClassifier(
        input_size=params.get("input_size", 1024),
        d_model=params.get("d_model", 256),
        num_layers=params.get("num_layers", 2),
        num_classes=params.get("num_classes", 7),
        dropout=params.get("dropout", 0.1)
    )

class EmotionModel(Wav2Vec2PreTrainedModel):

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.init_weights()

    def forward(self, input_values):
        outputs = self.wav2vec2(input_values)
        hidden_states = outputs[0]  # (batch_size, sequence_length, hidden_size)
        return hidden_states
    
## Text models

class Embedding():
    def __init__(self, model_name='jinaai/jina-embeddings-v3', pooling=None):
        self.model_name = model_name
        self.pooling = pooling
        self.device = DEVICE
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, code_revision='da863dd04a4e5dce6814c6625adfba87b83838aa', trust_remote_code=True)
        self.model = AutoModel.from_pretrained(model_name, code_revision='da863dd04a4e5dce6814c6625adfba87b83838aa', trust_remote_code=True).to(self.device)
        self.model.eval()

    def _mean_pooling(self, X):
        def mean_pooling(model_output, attention_mask):
            token_embeddings = model_output[0]
            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        encoded_input = self.tokenizer(X, padding=True, truncation=True, return_tensors='pt').to(self.device)
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings.unsqueeze(1)
    
    def get_embeddings(self, X, max_len):
        encoded_input = self.tokenizer(X, padding=True, truncation=True, return_tensors='pt').to(self.device)
        with torch.no_grad():
            features = self.model(**encoded_input)[0].detach().cpu().float().numpy()
        res = np.pad(features[:, :max_len, :], ((0, 0), (0, max(0, max_len - features.shape[1])), (0, 0)), "constant")
        return torch.tensor(res)
        
class RMSNorm(nn.Module):
    def __init__(self, d_model: int, eps: float = 1e-8) -> None:
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(d_model))

    def forward(self, x: Tensor) -> Tensor:        
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim = True) + self.eps) * self.weight

class Mamba(nn.Module):
    def __init__(self, num_layers, d_input, d_model, d_state=16, d_discr=None, ker_size=4, num_classes=7, max_tokens=95, model_name='jina', pooling=None):
        super().__init__()
        mamba_par = {
            'd_input' : d_input,
            'd_model' : d_model,
            'd_state' : d_state,
            'd_discr' : d_discr,
            'ker_size': ker_size
        }
        self.model_name = model_name
        self.max_tokens = max_tokens
        embed = Embedding(model_name, pooling)
        self.embedding = embed.get_embeddings
        self.layers = nn.ModuleList([nn.ModuleList([MambaBlock(**mamba_par), RMSNorm(d_input)]) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_input, num_classes)
        self.device = DEVICE

    def forward(self, seq, cache=None, with_features=True):
        seq = self.embedding(seq, self.max_tokens).to(self.device)
        for mamba, norm in self.layers:
            out, cache = mamba(norm(seq), cache)
            seq = out + seq
        if with_features:
            return self.fc_out(seq.mean(dim = 1)), seq
        else:
            return self.fc_out(seq.mean(dim = 1))
        
class MambaBlock(nn.Module):
    def __init__(self, d_input, d_model, d_state=16, d_discr=None, ker_size=4):
        super().__init__()
        d_discr = d_discr if d_discr is not None else d_model // 16
        self.in_proj  = nn.Linear(d_input, 2 * d_model, bias=False)
        self.out_proj = nn.Linear(d_model, d_input, bias=False)
        self.s_B = nn.Linear(d_model, d_state, bias=False)
        self.s_C = nn.Linear(d_model, d_state, bias=False)
        self.s_D = nn.Sequential(nn.Linear(d_model, d_discr, bias=False), nn.Linear(d_discr, d_model, bias=False),)
        self.conv = nn.Conv1d(
            in_channels=d_model,
            out_channels=d_model,
            kernel_size=ker_size,
            padding=ker_size - 1,
            groups=d_model,
            bias=True,
        )
        self.A = nn.Parameter(torch.arange(1, d_state + 1, dtype=torch.float).repeat(d_model, 1))
        self.D = nn.Parameter(torch.ones(d_model, dtype=torch.float))
        self.device = DEVICE
        
    def forward(self, seq, cache=None):
        b, l, d = seq.shape
        (prev_hid, prev_inp) = cache if cache is not None else (None, None)
        a, b = self.in_proj(seq).chunk(2, dim=-1)
        x = rearrange(a, 'b l d -> b d l')
        x = x if prev_inp is None else torch.cat((prev_inp, x), dim=-1)
        a = self.conv(x)[..., :l]
        a = rearrange(a, 'b d l -> b l d')
        a = silu(a)
        a, hid = self.ssm(a, prev_hid=prev_hid) 
        b = silu(b)
        out = a * b
        out =  self.out_proj(out)
        if cache:
            cache = (hid.squeeze(), x[..., 1:])   
        return out, cache
    
    def ssm(self, seq, prev_hid):
        A = -self.A
        D = +self.D
        B = self.s_B(seq)
        C = self.s_C(seq)
        s = softplus(D + self.s_D(seq))
        A_bar = einsum(torch.exp(A), s, 'd s,   b l d -> b l d s')
        B_bar = einsum(          B,  s, 'b l s, b l d -> b l d s')
        X_bar = einsum(B_bar, seq, 'b l d s, b l d -> b l d s')
        hid = self._hid_states(A_bar, X_bar, prev_hid=prev_hid)
        out = einsum(hid, C, 'b l d s, b l s -> b l d')
        out = out + D * seq
        return out, hid
    
    def _hid_states(self, A, X, prev_hid=None):
        b, l, d, s = A.shape
        A = rearrange(A, 'b l d s -> l b d s')
        X = rearrange(X, 'b l d s -> l b d s')
        if prev_hid is not None:
            return rearrange(A * prev_hid + X, 'l b d s -> b l d s')
        h = torch.zeros(b, d, s, device=self.device)
        return torch.stack([h := A_t * h + X_t for A_t, X_t in zip(A, X)], dim=1)