zakerytclarke committed on
Commit
2d7d97f
·
verified ·
1 Parent(s): ec60e4a

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +88 -43
src/streamlit_app.py CHANGED
@@ -1,5 +1,3 @@
1
- # app.py
2
-
3
  import os
4
 
5
  # ✅ Fix PermissionError on Hugging Face Spaces
@@ -39,6 +37,10 @@ model_type = st.sidebar.selectbox(
39
  )
40
 
41
  temperature = st.sidebar.slider("Sampling Temperature", 0.1, 2.0, 1.0)
 
 
 
 
42
  train_button = st.sidebar.button("Train Model")
43
 
44
  device = torch.device("cpu") # force CPU usage
@@ -74,9 +76,30 @@ def tokenize(text, tokenizer_type):
74
 
75
  tokens = tokenize(text_data, tokenizer_type)
76
  vocab = list(set(tokens))
 
 
 
 
 
 
77
  token_to_idx = {tok: i for i, tok in enumerate(vocab)}
78
  idx_to_token = {i: tok for tok, i in token_to_idx.items()}
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  ###################################
81
  # Models
82
  ###################################
@@ -122,15 +145,17 @@ class FFNN(nn.Module):
122
 
123
  def train_ffnn(tokens, context_size=3, epochs=3):
124
  data = []
125
- for i in range(len(tokens) - context_size):
126
- context = tokens[i:i+context_size-1]
127
- target = tokens[i+context_size-1]
 
 
128
  data.append((
129
- torch.tensor([token_to_idx[tok] for tok in context], device=device),
130
- token_to_idx[target]
131
  ))
132
 
133
- model = FFNN(len(vocab), context_size-1).to(device)
134
  optimizer = optim.Adam(model.parameters(), lr=0.01)
135
  criterion = nn.CrossEntropyLoss()
136
 
@@ -138,28 +163,33 @@ def train_ffnn(tokens, context_size=3, epochs=3):
138
  total_steps = epochs * len(data)
139
  step = 0
140
 
 
 
141
  for epoch in range(epochs):
142
  total_loss = 0
 
143
  for x, y in data:
144
- x = x.unsqueeze(0)
145
  y = torch.tensor([y], device=device)
 
 
146
  out = model(x)
147
  loss = criterion(out, y)
148
- optimizer.zero_grad()
149
  loss.backward()
150
  optimizer.step()
151
- total_loss += loss.item()
152
 
 
153
  step += 1
154
  progress_bar.progress(step / total_steps)
155
 
156
- st.write(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
157
 
158
  progress_bar.empty()
159
  return model
160
 
161
  def ffnn_predict(model, context, temperature=1.0):
162
- x = torch.tensor([token_to_idx.get(tok, 0) for tok in context[-2:]], device=device).unsqueeze(0)
 
163
  with torch.no_grad():
164
  logits = model(x).squeeze()
165
  probs = torch.softmax(logits / temperature, dim=0).cpu().numpy()
@@ -171,11 +201,13 @@ def ffnn_predict(model, context, temperature=1.0):
171
 
172
  def train_dt(tokens, context_size=3):
173
  X, y = [], []
174
- for i in range(len(tokens) - context_size):
175
- context = tokens[i:i+context_size-1]
176
- target = tokens[i+context_size-1]
177
- X.append([token_to_idx[tok] for tok in context])
178
- y.append(token_to_idx[target])
 
 
179
 
180
  with st.spinner("Training Decision Tree..."):
181
  model = DecisionTreeClassifier()
@@ -183,7 +215,8 @@ def train_dt(tokens, context_size=3):
183
  return model
184
 
185
  def dt_predict(model, context):
186
- x = [token_to_idx.get(tok, 0) for tok in context[-2:]]
 
187
  pred = model.predict([x])[0]
188
  return idx_to_token[pred]
189
 
@@ -193,11 +226,13 @@ def dt_predict(model, context):
193
 
194
  def train_gbt(tokens, context_size=3):
195
  X, y = [], []
196
- for i in range(len(tokens) - context_size):
197
- context = tokens[i:i+context_size-1]
198
- target = tokens[i+context_size-1]
199
- X.append([token_to_idx[tok] for tok in context])
200
- y.append(token_to_idx[target])
 
 
201
 
202
  with st.spinner("Training Gradient Boosted Tree..."):
203
  model = GradientBoostingClassifier()
@@ -205,7 +240,8 @@ def train_gbt(tokens, context_size=3):
205
  return model
206
 
207
  def gbt_predict(model, context):
208
- x = [token_to_idx.get(tok, 0) for tok in context[-2:]]
 
209
  pred = model.predict([x])[0]
210
  return idx_to_token[pred]
211
 
@@ -228,12 +264,14 @@ class RNNModel(nn.Module):
228
 
229
  def train_rnn(tokens, context_size=3, epochs=3):
230
  data = []
231
- for i in range(len(tokens) - context_size):
232
- context = tokens[i:i+context_size-1]
233
- target = tokens[i+context_size-1]
 
 
234
  data.append((
235
- torch.tensor([token_to_idx[tok] for tok in context], device=device),
236
- token_to_idx[target]
237
  ))
238
 
239
  model = RNNModel(len(vocab)).to(device)
@@ -244,9 +282,12 @@ def train_rnn(tokens, context_size=3, epochs=3):
244
  total_steps = epochs * len(data)
245
  step = 0
246
 
 
 
247
  for epoch in range(epochs):
248
  total_loss = 0
249
  h = None
 
250
  for x, y in data:
251
  x = x.unsqueeze(0)
252
  y = torch.tensor([y], device=device)
@@ -260,13 +301,14 @@ def train_rnn(tokens, context_size=3, epochs=3):
260
  step += 1
261
  progress_bar.progress(step / total_steps)
262
 
263
- st.write(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
264
 
265
  progress_bar.empty()
266
  return model
267
 
268
  def rnn_predict(model, context, temperature=1.0):
269
- x = torch.tensor([token_to_idx.get(tok, 0) for tok in context[-2:]], device=device).unsqueeze(0)
 
270
  with torch.no_grad():
271
  logits, _ = model(x)
272
  probs = torch.softmax(logits.squeeze() / temperature, dim=0).cpu().numpy()
@@ -277,22 +319,23 @@ def rnn_predict(model, context, temperature=1.0):
277
  ###################################
278
 
279
  if train_button:
280
- st.write(f"Training **{model_type}** model...")
281
 
282
  if model_type == "N-gram":
283
  with st.spinner("Training N-gram model..."):
284
- model = NGramModel(tokens, n=3)
285
  elif model_type == "Feed Forward NN":
286
- model = train_ffnn(tokens)
287
  elif model_type == "Decision Tree":
288
- model = train_dt(tokens)
289
  elif model_type == "Gradient Boosted Tree":
290
- model = train_gbt(tokens)
291
  elif model_type == "RNN":
292
- model = train_rnn(tokens)
293
 
294
  st.session_state["model"] = model
295
  st.session_state["model_type"] = model_type
 
296
  st.success(f"{model_type} model trained.")
297
 
298
  ###################################
@@ -309,16 +352,18 @@ if "model" in st.session_state:
309
  generated = context.copy()
310
 
311
  for _ in range(20):
 
 
312
  if st.session_state["model_type"] == "N-gram":
313
- next_tok = st.session_state["model"].predict(generated, temperature)
314
  elif st.session_state["model_type"] == "Feed Forward NN":
315
- next_tok = ffnn_predict(st.session_state["model"], generated, temperature)
316
  elif st.session_state["model_type"] == "Decision Tree":
317
- next_tok = dt_predict(st.session_state["model"], generated)
318
  elif st.session_state["model_type"] == "Gradient Boosted Tree":
319
- next_tok = gbt_predict(st.session_state["model"], generated)
320
  elif st.session_state["model_type"] == "RNN":
321
- next_tok = rnn_predict(st.session_state["model"], generated, temperature)
322
 
323
  generated.append(next_tok)
324
  if next_tok == "<END>":
 
 
 
1
  import os
2
 
3
  # ✅ Fix PermissionError on Hugging Face Spaces
 
37
  )
38
 
39
  temperature = st.sidebar.slider("Sampling Temperature", 0.1, 2.0, 1.0)
40
+
41
+ # Context size slider (minimum 2)
42
+ context_size = st.sidebar.slider("Context Size (how many tokens to look back)", min_value=2, max_value=10, value=3, step=1)
43
+
44
  train_button = st.sidebar.button("Train Model")
45
 
46
  device = torch.device("cpu") # force CPU usage
 
76
 
77
  tokens = tokenize(text_data, tokenizer_type)
78
  vocab = list(set(tokens))
79
+
80
+ # Add PAD token to vocab for padding contexts shorter than context_size - 1
81
+ PAD_TOKEN = "<PAD>"
82
+ if PAD_TOKEN not in vocab:
83
+ vocab.append(PAD_TOKEN)
84
+
85
  token_to_idx = {tok: i for i, tok in enumerate(vocab)}
86
  idx_to_token = {i: tok for tok, i in token_to_idx.items()}
87
 
88
+ ###################################
89
+ # Helper to pad context
90
+ ###################################
91
+
92
+ def pad_context(context, size):
93
+ """
94
+ Pads the context list at the front with PAD_TOKEN if length < size,
95
+ or truncates to last `size` tokens if longer.
96
+ """
97
+ pad_len = size - len(context)
98
+ if pad_len > 0:
99
+ return [PAD_TOKEN]*pad_len + context
100
+ else:
101
+ return context[-size:]
102
+
103
  ###################################
104
  # Models
105
  ###################################
 
145
 
146
  def train_ffnn(tokens, context_size=3, epochs=3):
147
  data = []
148
+ for i in range(len(tokens)):
149
+ start_idx = i - (context_size - 1)
150
+ context = tokens[start_idx:i] if start_idx >= 0 else tokens[0:i]
151
+ context = pad_context(context, context_size - 1)
152
+ target = tokens[i]
153
  data.append((
154
+ torch.tensor([token_to_idx.get(t, token_to_idx[PAD_TOKEN]) for t in context], device=device),
155
+ token_to_idx.get(target, token_to_idx[PAD_TOKEN])
156
  ))
157
 
158
+ model = FFNN(len(vocab), context_size - 1).to(device)
159
  optimizer = optim.Adam(model.parameters(), lr=0.01)
160
  criterion = nn.CrossEntropyLoss()
161
 
 
163
  total_steps = epochs * len(data)
164
  step = 0
165
 
166
+ model.train()
167
+
168
  for epoch in range(epochs):
169
  total_loss = 0
170
+ random.shuffle(data)
171
  for x, y in data:
172
+ x = x.unsqueeze(0) # batch size 1
173
  y = torch.tensor([y], device=device)
174
+
175
+ optimizer.zero_grad()
176
  out = model(x)
177
  loss = criterion(out, y)
 
178
  loss.backward()
179
  optimizer.step()
 
180
 
181
+ total_loss += loss.item()
182
  step += 1
183
  progress_bar.progress(step / total_steps)
184
 
185
+ st.write(f"Epoch {epoch+1}, Loss: {total_loss/len(data):.4f}")
186
 
187
  progress_bar.empty()
188
  return model
189
 
190
  def ffnn_predict(model, context, temperature=1.0):
191
+ context = pad_context(context, context_size - 1)
192
+ x = torch.tensor([token_to_idx.get(tok, token_to_idx[PAD_TOKEN]) for tok in context], device=device).unsqueeze(0)
193
  with torch.no_grad():
194
  logits = model(x).squeeze()
195
  probs = torch.softmax(logits / temperature, dim=0).cpu().numpy()
 
201
 
202
  def train_dt(tokens, context_size=3):
203
  X, y = [], []
204
+ for i in range(len(tokens)):
205
+ start_idx = i - (context_size - 1)
206
+ context = tokens[start_idx:i] if start_idx >= 0 else tokens[0:i]
207
+ context = pad_context(context, context_size - 1)
208
+ target = tokens[i]
209
+ X.append([token_to_idx.get(t, token_to_idx[PAD_TOKEN]) for t in context])
210
+ y.append(token_to_idx.get(target, token_to_idx[PAD_TOKEN]))
211
 
212
  with st.spinner("Training Decision Tree..."):
213
  model = DecisionTreeClassifier()
 
215
  return model
216
 
217
  def dt_predict(model, context):
218
+ context = pad_context(context, context_size - 1)
219
+ x = [token_to_idx.get(tok, token_to_idx[PAD_TOKEN]) for tok in context]
220
  pred = model.predict([x])[0]
221
  return idx_to_token[pred]
222
 
 
226
 
227
  def train_gbt(tokens, context_size=3):
228
  X, y = [], []
229
+ for i in range(len(tokens)):
230
+ start_idx = i - (context_size - 1)
231
+ context = tokens[start_idx:i] if start_idx >= 0 else tokens[0:i]
232
+ context = pad_context(context, context_size - 1)
233
+ target = tokens[i]
234
+ X.append([token_to_idx.get(t, token_to_idx[PAD_TOKEN]) for t in context])
235
+ y.append(token_to_idx.get(target, token_to_idx[PAD_TOKEN]))
236
 
237
  with st.spinner("Training Gradient Boosted Tree..."):
238
  model = GradientBoostingClassifier()
 
240
  return model
241
 
242
  def gbt_predict(model, context):
243
+ context = pad_context(context, context_size - 1)
244
+ x = [token_to_idx.get(tok, token_to_idx[PAD_TOKEN]) for tok in context]
245
  pred = model.predict([x])[0]
246
  return idx_to_token[pred]
247
 
 
264
 
265
  def train_rnn(tokens, context_size=3, epochs=3):
266
  data = []
267
+ for i in range(len(tokens)):
268
+ start_idx = i - (context_size - 1)
269
+ context = tokens[start_idx:i] if start_idx >= 0 else tokens[0:i]
270
+ context = pad_context(context, context_size - 1)
271
+ target = tokens[i]
272
  data.append((
273
+ torch.tensor([token_to_idx.get(t, token_to_idx[PAD_TOKEN]) for t in context], device=device),
274
+ token_to_idx.get(target, token_to_idx[PAD_TOKEN])
275
  ))
276
 
277
  model = RNNModel(len(vocab)).to(device)
 
282
  total_steps = epochs * len(data)
283
  step = 0
284
 
285
+ model.train()
286
+
287
  for epoch in range(epochs):
288
  total_loss = 0
289
  h = None
290
+ random.shuffle(data)
291
  for x, y in data:
292
  x = x.unsqueeze(0)
293
  y = torch.tensor([y], device=device)
 
301
  step += 1
302
  progress_bar.progress(step / total_steps)
303
 
304
+ st.write(f"Epoch {epoch+1}, Loss: {total_loss/len(data):.4f}")
305
 
306
  progress_bar.empty()
307
  return model
308
 
309
  def rnn_predict(model, context, temperature=1.0):
310
+ context = pad_context(context, context_size - 1)
311
+ x = torch.tensor([token_to_idx.get(tok, token_to_idx[PAD_TOKEN]) for tok in context], device=device).unsqueeze(0)
312
  with torch.no_grad():
313
  logits, _ = model(x)
314
  probs = torch.softmax(logits.squeeze() / temperature, dim=0).cpu().numpy()
 
319
  ###################################
320
 
321
  if train_button:
322
+ st.write(f"Training **{model_type}** model with context size {context_size}...")
323
 
324
  if model_type == "N-gram":
325
  with st.spinner("Training N-gram model..."):
326
+ model = NGramModel(tokens, n=context_size)
327
  elif model_type == "Feed Forward NN":
328
+ model = train_ffnn(tokens, context_size=context_size)
329
  elif model_type == "Decision Tree":
330
+ model = train_dt(tokens, context_size=context_size)
331
  elif model_type == "Gradient Boosted Tree":
332
+ model = train_gbt(tokens, context_size=context_size)
333
  elif model_type == "RNN":
334
+ model = train_rnn(tokens, context_size=context_size)
335
 
336
  st.session_state["model"] = model
337
  st.session_state["model_type"] = model_type
338
+ st.session_state["context_size"] = context_size
339
  st.success(f"{model_type} model trained.")
340
 
341
  ###################################
 
352
  generated = context.copy()
353
 
354
  for _ in range(20):
355
+ ctx = pad_context(generated, st.session_state["context_size"] - 1)
356
+
357
  if st.session_state["model_type"] == "N-gram":
358
+ next_tok = st.session_state["model"].predict(ctx, temperature)
359
  elif st.session_state["model_type"] == "Feed Forward NN":
360
+ next_tok = ffnn_predict(st.session_state["model"], ctx, temperature)
361
  elif st.session_state["model_type"] == "Decision Tree":
362
+ next_tok = dt_predict(st.session_state["model"], ctx)
363
  elif st.session_state["model_type"] == "Gradient Boosted Tree":
364
+ next_tok = gbt_predict(st.session_state["model"], ctx)
365
  elif st.session_state["model_type"] == "RNN":
366
+ next_tok = rnn_predict(st.session_state["model"], ctx, temperature)
367
 
368
  generated.append(next_tok)
369
  if next_tok == "<END>":