Update api.py
api.py CHANGED
@@ -84,61 +84,67 @@ class SwiGLU(tf.keras.layers.Layer):
         x_val, x_gate = tf.split(x_proj, 2, axis=-1)
         return self.out(x_val * tf.nn.silu(x_gate))
 
-class
-    def __init__(self, d_model, d_ff, num_heads=8, dropout_rate=0.
-        super().__init__()
-        self.ln1 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
-        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads)
-        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
-        self.adapter_down = tf.keras.layers.Dense(adapter_dim, activation='gelu')
-        self.adapter_up = tf.keras.layers.Dense(d_model)
-
-        self.ln2 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
-        self.ffn = SwiGLU(d_model, d_ff)
-        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
-        self.rope = RotaryPositionalEmbedding(d_model // num_heads)
-
-    def call(self, x, training=False):
-        x_norm = self.ln1(x)
-        b, s, _ = tf.shape(x_norm)[0], tf.shape(x_norm)[1], tf.shape(x_norm)[2]
-        h = self.mha.num_heads
-        d = x_norm.shape[-1] // h
-
-        qkv = tf.reshape(x_norm, [b, s, h, d])
-        qkv = tf.transpose(qkv, [0, 2, 1, 3])
-        q = self.rope(qkv)
-        k = self.rope(qkv)
-        q = tf.reshape(tf.transpose(q, [0, 2, 1, 3]), [b, s, h * d])
-        k = tf.reshape(tf.transpose(k, [0, 2, 1, 3]), [b, s, h * d])
-
-        attn_out = self.mha(query=q, value=x_norm, key=k, use_causal_mask=True, training=training)
-        attn_out = self.dropout1(attn_out, training=training)
-
-
-
-
-
-        ffn_out = self.ffn(self.ln2(x))
-        x = x + self.dropout2(ffn_out, training=training)
-        return x
-
-
-
-
-
-
-        self.ln_f = tf.keras.layers.LayerNormalization(epsilon=1e-5)
-
-    def call(self, x, training=False):
-        x = self.token_embedding(x)
-        for block in self.blocks:
-            x = block(x, training=training)
-        x = self.ln_f(x)
-        logits = tf.matmul(x, self.token_embedding.embeddings, transpose_b=True)
-        return logits
-
-
-
+class Block(tf.keras.layers.Layer):
+    def __init__(self, d_model, d_ff, num_heads=8, dropout_rate=0.05, adapter_dim=64):
+        super().__init__()
+        self.ln1 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
+        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads)
+        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
+        self.adapter_down = tf.keras.layers.Dense(adapter_dim, activation='gelu')
+        self.adapter_up = tf.keras.layers.Dense(d_model)
+
+        self.ln2 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
+        self.ffn = SwiGLU(d_model, d_ff)
+        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
+        self.rope = RotaryPositionalEmbedding(d_model // num_heads)
+
+    def call(self, x, training=False):
+        x_norm = self.ln1(x)
+        b, s, _ = tf.shape(x_norm)[0], tf.shape(x_norm)[1], tf.shape(x_norm)[2]
+        h = self.mha.num_heads
+        d = x_norm.shape[-1] // h
+
+        qkv = tf.reshape(x_norm, [b, s, h, d])
+        qkv = tf.transpose(qkv, [0, 2, 1, 3])
+        q = self.rope(qkv)
+        k = self.rope(qkv)
+        q = tf.reshape(tf.transpose(q, [0, 2, 1, 3]), [b, s, h * d])
+        k = tf.reshape(tf.transpose(k, [0, 2, 1, 3]), [b, s, h * d])
+
+        attn_out = self.mha(query=q, value=x_norm, key=k, use_causal_mask=True, training=training)
+        attn_out = self.dropout1(attn_out, training=training)
+
+        adapter_out = self.adapter_up(self.adapter_down(attn_out))
+        attn_out = attn_out + adapter_out
+
+        x = x + attn_out
+        ffn_out = self.ffn(self.ln2(x))
+        x = x + self.dropout2(ffn_out, training=training)
+        return x
+
+class Flexi(tf.keras.Model):
+    def __init__(self, vocab_size, seq_len, d_model, d_ff, n_layers, num_heads=8, dropout_rate=0.05):
+        super().__init__()
+        self.token_embedding = tf.keras.layers.Embedding(vocab_size, d_model)
+        self.blocks = [Block(d_model, d_ff, num_heads, dropout_rate) for _ in range(n_layers)]
+        self.ln_f = tf.keras.layers.LayerNormalization(epsilon=1e-5)
+
+    def call(self, x, training=False):
+        x = self.token_embedding(x)
+        for block in self.blocks:
+            x = block(x, training=training)
+        x = self.ln_f(x)
+        logits = tf.matmul(x, self.token_embedding.embeddings, transpose_b=True)
+        return logits
+
+model = Flexi(
+    vocab_size=vocab_size,
+    seq_len=max_len,
+    d_model=256,
+    d_ff=1024,
+    n_layers=16
+)
+
 dummy_input = tf.zeros((1, max_len), dtype=tf.int32)  # batch 1, sequence length max_len
 _ = model(dummy_input)  # the model gets built
 model.load_weights("Flexi.weights.h5")
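
Note on RotaryPositionalEmbedding: the class is defined earlier in api.py and is not part of this hunk. For reference only, a minimal rotary-embedding layer consistent with how self.rope(qkv) is called above (a [batch, heads, seq, head_dim] tensor, rotated in channel pairs) might look like the sketch below. The name, signature, and base frequency here are assumptions for illustration, not the implementation actually used in api.py.

# Illustrative sketch only -- the real RotaryPositionalEmbedding in api.py may differ.
import tensorflow as tf

class RotaryPositionalEmbeddingSketch(tf.keras.layers.Layer):
    def __init__(self, dim, base=10000.0):
        super().__init__()
        self.dim = dim    # head_dim; assumed even so channels can be paired
        self.base = base

    def call(self, x):
        # x: [batch, heads, seq, head_dim]
        seq_len = tf.shape(x)[2]
        half = self.dim // 2
        # One inverse frequency per channel pair: base^(-2i/dim)
        inv_freq = 1.0 / (self.base ** (tf.range(half, dtype=tf.float32) * 2.0 / self.dim))
        pos = tf.range(seq_len, dtype=tf.float32)
        angles = tf.einsum('s,d->sd', pos, inv_freq)   # [seq, half]
        cos = tf.cos(angles)[None, None, :, :]         # broadcast over batch and heads
        sin = tf.sin(angles)[None, None, :, :]
        x1, x2 = tf.split(x, 2, axis=-1)               # two halves of the channel dim
        # Rotate each (x1, x2) pair by its position-dependent angle
        return tf.concat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], axis=-1)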
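
The dummy forward pass before load_weights is needed because Flexi is a subclassed Keras model: its variables are only created on the first call, and load_weights requires them to exist. Once the weights are loaded, the model can be used as a causal LM. The helper below is a hedged usage sketch, not part of api.py: token_ids, eos_id, and max_new_tokens are placeholders, and whatever tokenizer api.py uses to produce the prompt ids is assumed rather than shown.

# Hypothetical greedy-decoding helper; not part of api.py.
import tensorflow as tf

def greedy_generate(model, token_ids, max_len, max_new_tokens=32, eos_id=None):
    ids = list(token_ids)                                  # prompt token ids from the tokenizer
    for _ in range(max_new_tokens):
        x = tf.constant([ids[-max_len:]], dtype=tf.int32)  # [1, current_len], clipped to max_len
        logits = model(x, training=False)                  # [1, current_len, vocab_size]
        next_id = int(tf.argmax(logits[0, -1]))            # most likely next token
        ids.append(next_id)
        if eos_id is not None and next_id == eos_id:
            break
    return ids

For example, after model.load_weights("Flexi.weights.h5"), something like greedy_generate(model, prompt_ids, max_len) would extend a tokenized prompt one argmax token at a time.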