Yuchan5386 committed (verified)
Commit 959a616 · Parent(s): 77d5918

Update api.py

Files changed (1): api.py (+58, -52)
api.py CHANGED
@@ -84,61 +84,67 @@ class SwiGLU(tf.keras.layers.Layer):
         x_val, x_gate = tf.split(x_proj, 2, axis=-1)
         return self.out(x_val * tf.nn.silu(x_gate))
 
-class GPTBlock(tf.keras.layers.Layer):
-    def __init__(self, d_model, d_ff, num_heads=8, dropout_rate=0.1, adapter_dim=64):
-        super().__init__()
-        self.ln1 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
-        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads)
-        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
-        self.adapter_down = tf.keras.layers.Dense(adapter_dim, activation='gelu')
-        self.adapter_up = tf.keras.layers.Dense(d_model)
-
-        self.ln2 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
-        self.ffn = SwiGLU(d_model, d_ff)
-        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
-        self.rope = RotaryPositionalEmbedding(d_model // num_heads)
-
-    def call(self, x, training=False):
-        x_norm = self.ln1(x)
-        b, s, _ = tf.shape(x_norm)[0], tf.shape(x_norm)[1], tf.shape(x_norm)[2]
-        h = self.mha.num_heads
-        d = x_norm.shape[-1] // h
-
-        qkv = tf.reshape(x_norm, [b, s, h, d])
-        qkv = tf.transpose(qkv, [0, 2, 1, 3])
-        q = self.rope(qkv)
-        k = self.rope(qkv)
-        q = tf.reshape(tf.transpose(q, [0, 2, 1, 3]), [b, s, h * d])
-        k = tf.reshape(tf.transpose(k, [0, 2, 1, 3]), [b, s, h * d])
-
-        attn_out = self.mha(query=q, value=x_norm, key=k, use_causal_mask=True, training=training)
-        attn_out = self.dropout1(attn_out, training=training)
-
-        adapter_out = self.adapter_up(self.adapter_down(attn_out))
-        attn_out = attn_out + adapter_out
-
-        x = x + attn_out
-        ffn_out = self.ffn(self.ln2(x))
-        x = x + self.dropout2(ffn_out, training=training)
-        return x
-
-class InteractGPT(tf.keras.Model):
-    def __init__(self, vocab_size, seq_len, d_model, d_ff, n_layers, num_heads=8, dropout_rate=0.1):
-        super().__init__()
-        self.token_embedding = tf.keras.layers.Embedding(vocab_size, d_model)
-        self.blocks = [GPTBlock(d_model, d_ff, num_heads, dropout_rate) for _ in range(n_layers)]
-        self.ln_f = tf.keras.layers.LayerNormalization(epsilon=1e-5)
-
-    def call(self, x, training=False):
-        x = self.token_embedding(x)
-        for block in self.blocks:
-            x = block(x, training=training)
-        x = self.ln_f(x)
-        logits = tf.matmul(x, self.token_embedding.embeddings, transpose_b=True)
-        return logits
-
-model = InteractGPT(vocab_size=vocab_size, seq_len=max_len, d_model=256, d_ff=1024, n_layers=6)
-
+class Block(tf.keras.layers.Layer):
+    def __init__(self, d_model, d_ff, num_heads=8, dropout_rate=0.05, adapter_dim=64):
+        super().__init__()
+        self.ln1 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
+        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads)
+        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
+        self.adapter_down = tf.keras.layers.Dense(adapter_dim, activation='gelu')
+        self.adapter_up = tf.keras.layers.Dense(d_model)
+
+        self.ln2 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
+        self.ffn = SwiGLU(d_model, d_ff)
+        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
+        self.rope = RotaryPositionalEmbedding(d_model // num_heads)
+
+    def call(self, x, training=False):
+        x_norm = self.ln1(x)
+        b, s, _ = tf.shape(x_norm)[0], tf.shape(x_norm)[1], tf.shape(x_norm)[2]
+        h = self.mha.num_heads
+        d = x_norm.shape[-1] // h
+
+        qkv = tf.reshape(x_norm, [b, s, h, d])
+        qkv = tf.transpose(qkv, [0, 2, 1, 3])
+        q = self.rope(qkv)
+        k = self.rope(qkv)
+        q = tf.reshape(tf.transpose(q, [0, 2, 1, 3]), [b, s, h * d])
+        k = tf.reshape(tf.transpose(k, [0, 2, 1, 3]), [b, s, h * d])
+
+        attn_out = self.mha(query=q, value=x_norm, key=k, use_causal_mask=True, training=training)
+        attn_out = self.dropout1(attn_out, training=training)
+
+        adapter_out = self.adapter_up(self.adapter_down(attn_out))
+        attn_out = attn_out + adapter_out
+
+        x = x + attn_out
+        ffn_out = self.ffn(self.ln2(x))
+        x = x + self.dropout2(ffn_out, training=training)
+        return x
+
+class Flexi(tf.keras.Model):
+    def __init__(self, vocab_size, seq_len, d_model, d_ff, n_layers, num_heads=8, dropout_rate=0.05):
+        super().__init__()
+        self.token_embedding = tf.keras.layers.Embedding(vocab_size, d_model)
+        self.blocks = [Block(d_model, d_ff, num_heads, dropout_rate) for _ in range(n_layers)]
+        self.ln_f = tf.keras.layers.LayerNormalization(epsilon=1e-5)
+
+    def call(self, x, training=False):
+        x = self.token_embedding(x)
+        for block in self.blocks:
+            x = block(x, training=training)
+        x = self.ln_f(x)
+        logits = tf.matmul(x, self.token_embedding.embeddings, transpose_b=True)
+        return logits
+
+model = Flexi(
+    vocab_size=vocab_size,
+    seq_len=max_len,
+    d_model=256,
+    d_ff=1024,
+    n_layers=16
+)
+
 dummy_input = tf.zeros((1, max_len), dtype=tf.int32)  # batch 1, sequence length max_len
 _ = model(dummy_input)  # builds the model
 model.load_weights("Flexi.weights.h5")
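
The hunk starts inside SwiGLU's call method, so only the layer's tail is visible in the diff. A minimal sketch of a complete SwiGLU layer consistent with that fragment; the constructor is outside the hunk, so the fused projection (named proj here, width d_ff * 2) is an assumption, not taken from the commit:

import tensorflow as tf

class SwiGLU(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super().__init__()
        # Assumed: one fused projection producing the value and gate halves
        self.proj = tf.keras.layers.Dense(d_ff * 2)
        self.out = tf.keras.layers.Dense(d_model)

    def call(self, x):
        x_proj = self.proj(x)
        # Visible in the diff: split into value/gate, gate with SiLU, project back
        x_val, x_gate = tf.split(x_proj, 2, axis=-1)
        return self.out(x_val * tf.nn.silu(x_gate))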
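
Both versions of the block construct RotaryPositionalEmbedding(d_model // num_heads) and call it on tensors shaped [batch, heads, seq_len, head_dim], but the class itself is defined outside this hunk. A minimal rotate-half RoPE sketch matching that call signature; the base frequency of 10000 and the exact rotation layout are assumptions:

class RotaryPositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, dim):
        super().__init__()
        # dim is the per-head width (d_model // num_heads), assumed even
        self.inv_freq = 1.0 / (10000.0 ** (tf.range(0, dim, 2, dtype=tf.float32) / dim))

    def call(self, x):
        # x: [batch, heads, seq_len, dim]
        seq_len = tf.shape(x)[2]
        t = tf.range(seq_len, dtype=tf.float32)
        freqs = tf.einsum('i,j->ij', t, self.inv_freq)   # [seq_len, dim // 2]
        emb = tf.concat([freqs, freqs], axis=-1)         # [seq_len, dim]
        cos = tf.cos(emb)[None, None, :, :]
        sin = tf.sin(emb)[None, None, :, :]
        x1, x2 = tf.split(x, 2, axis=-1)
        rotated = tf.concat([-x2, x1], axis=-1)          # rotate-half trick
        return x * cos + rotated * sin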