Spaces:

zakerytclarke
/

diy-language-model

Sleeping

App Files Community

zakerytclarke committed on Jun 29

Commit

dc3a847

verified ·

1 Parent(s): cd0c9a0

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +327 -38

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,329 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+# streamlit_app.py
 import streamlit as st
+from datasets import load_dataset
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from collections import defaultdict, Counter
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import GradientBoostingClassifier
+import random
+st.title("🧠 Language Model Explorer")
+###################################
+# Sidebar configuration
+###################################
+dataset_name = st.sidebar.selectbox(
+    "Choose Dataset",
+    ["squad", "tiny_shakespeare"]
+)
+tokenizer_type = st.sidebar.selectbox(
+    "Choose Tokenizer",
+    ["character", "word"]
+)
+model_type = st.sidebar.selectbox(
+    "Choose Model",
+    ["N-gram", "Feed Forward NN", "Decision Tree", "Gradient Boosted Tree", "RNN"]
+)
+temperature = st.sidebar.slider("Sampling Temperature", 0.1, 2.0, 1.0)
+train_button = st.sidebar.button("Train Model")
+device = torch.device("cpu")  # force CPU usage
+###################################
+# Load dataset
+###################################
+@st.cache_data
+def load_text(dataset_name):
+    if dataset_name == "squad":
+        data = load_dataset("squad", split="train[:1%]")
+        texts = [x['context'] for x in data]
+    elif dataset_name == "tiny_shakespeare":
+        data = load_dataset("tiny_shakespeare")
+        texts = [data['train'][0]['text']]
+    else:
+        texts = ["hello world"]
+    return " ".join(texts)
+text_data = load_text(dataset_name)
+###################################
+# Tokenization
+###################################
+def tokenize(text, tokenizer_type):
+    if tokenizer_type == "character":
+        tokens = list(text)
+    elif tokenizer_type == "word":
+        tokens = text.split()
+    return tokens
+tokens = tokenize(text_data, tokenizer_type)
+vocab = list(set(tokens))
+token_to_idx = {tok: i for i, tok in enumerate(vocab)}
+idx_to_token = {i: tok for tok, i in token_to_idx.items()}
+###################################
+# Models
+###################################
+class NGramModel:
+    def __init__(self, tokens, n=3):
+        self.n = n
+        self.model = defaultdict(Counter)
+        for i in range(len(tokens) - n):
+            context = tuple(tokens[i:i+n-1])
+            next_token = tokens[i+n-1]
+            self.model[context][next_token] += 1
+    def predict(self, context, temperature=1.0):
+        context = tuple(context[-(self.n-1):])
+        counts = self.model.get(context, None)
+        if counts is None:
+            return random.choice(list(token_to_idx.keys()))
+        items = list(counts.items())
+        tokens_, freqs = zip(*items)
+        probs = np.array(freqs, dtype=float)
+        probs = probs ** (1.0 / temperature)
+        probs /= probs.sum()
+        return np.random.choice(tokens_, p=probs)
+###################################
+# Feed Forward NN
+###################################
+class FFNN(nn.Module):
+    def __init__(self, vocab_size, context_size, hidden_size=128):
+        super().__init__()
+        self.embed = nn.Embedding(vocab_size, hidden_size)
+        self.fc1 = nn.Linear(hidden_size * context_size, hidden_size)
+        self.fc2 = nn.Linear(hidden_size, vocab_size)
+    def forward(self, x):
+        x = self.embed(x)
+        x = x.view(x.size(0), -1)
+        x = torch.relu(self.fc1(x))
+        x = self.fc2(x)
+        return x
+def train_ffnn(tokens, context_size=3, epochs=3):
+    data = []
+    for i in range(len(tokens) - context_size):
+        context = tokens[i:i+context_size-1]
+        target = tokens[i+context_size-1]
+        data.append((
+            torch.tensor([token_to_idx[tok] for tok in context], device=device),
+            token_to_idx[target]
+        ))
+    model = FFNN(len(vocab), context_size-1).to(device)
+    optimizer = optim.Adam(model.parameters(), lr=0.01)
+    criterion = nn.CrossEntropyLoss()
+    progress_bar = st.progress(0)
+    total_steps = epochs * len(data)
+    step = 0
+    for epoch in range(epochs):
+        total_loss = 0
+        for x, y in data:
+            x = x.unsqueeze(0)
+            y = torch.tensor([y], device=device)
+            out = model(x)
+            loss = criterion(out, y)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+            total_loss += loss.item()
+            step += 1
+            progress_bar.progress(step / total_steps)
+        st.write(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
+    progress_bar.empty()
+    return model
+def ffnn_predict(model, context, temperature=1.0):
+    x = torch.tensor([token_to_idx.get(tok, 0) for tok in context[-2:]], device=device).unsqueeze(0)
+    with torch.no_grad():
+        logits = model(x).squeeze()
+        probs = torch.softmax(logits / temperature, dim=0).cpu().numpy()
+        return np.random.choice(vocab, p=probs)
+###################################
+# Decision Tree
+###################################
+def train_dt(tokens, context_size=3):
+    X, y = [], []
+    for i in range(len(tokens) - context_size):
+        context = tokens[i:i+context_size-1]
+        target = tokens[i+context_size-1]
+        X.append([token_to_idx[tok] for tok in context])
+        y.append(token_to_idx[target])
+    with st.spinner("Training Decision Tree..."):
+        model = DecisionTreeClassifier()
+        model.fit(X, y)
+    return model
+def dt_predict(model, context):
+    x = [token_to_idx.get(tok, 0) for tok in context[-2:]]
+    pred = model.predict([x])[0]
+    return idx_to_token[pred]
+###################################
+# Gradient Boosted Tree
+###################################
+def train_gbt(tokens, context_size=3):
+    X, y = [], []
+    for i in range(len(tokens) - context_size):
+        context = tokens[i:i+context_size-1]
+        target = tokens[i+context_size-1]
+        X.append([token_to_idx[tok] for tok in context])
+        y.append(token_to_idx[target])
+    with st.spinner("Training Gradient Boosted Tree..."):
+        model = GradientBoostingClassifier()
+        model.fit(X, y)
+    return model
+def gbt_predict(model, context):
+    x = [token_to_idx.get(tok, 0) for tok in context[-2:]]
+    pred = model.predict([x])[0]
+    return idx_to_token[pred]
+###################################
+# RNN
+###################################
+class RNNModel(nn.Module):
+    def __init__(self, vocab_size, embed_size=64, hidden_size=128):
+        super().__init__()
+        self.embed = nn.Embedding(vocab_size, embed_size)
+        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
+        self.fc = nn.Linear(hidden_size, vocab_size)
+    def forward(self, x, h=None):
+        x = self.embed(x)
+        out, h = self.rnn(x, h)
+        out = self.fc(out[:, -1, :])
+        return out, h
+def train_rnn(tokens, context_size=3, epochs=3):
+    data = []
+    for i in range(len(tokens) - context_size):
+        context = tokens[i:i+context_size-1]
+        target = tokens[i+context_size-1]
+        data.append((
+            torch.tensor([token_to_idx[tok] for tok in context], device=device),
+            token_to_idx[target]
+        ))
+    model = RNNModel(len(vocab)).to(device)
+    optimizer = optim.Adam(model.parameters(), lr=0.01)
+    criterion = nn.CrossEntropyLoss()
+    progress_bar = st.progress(0)
+    total_steps = epochs * len(data)
+    step = 0
+    for epoch in range(epochs):
+        total_loss = 0
+        h = None
+        for x, y in data:
+            x = x.unsqueeze(0)
+            y = torch.tensor([y], device=device)
+            out, h = model(x, h)
+            loss = criterion(out, y)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+            total_loss += loss.item()
+            step += 1
+            progress_bar.progress(step / total_steps)
+        st.write(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
+    progress_bar.empty()
+    return model
+def rnn_predict(model, context, temperature=1.0):
+    x = torch.tensor([token_to_idx.get(tok, 0) for tok in context[-2:]], device=device).unsqueeze(0)
+    with torch.no_grad():
+        logits, _ = model(x)
+        probs = torch.softmax(logits.squeeze() / temperature, dim=0).cpu().numpy()
+        return np.random.choice(vocab, p=probs)
+###################################
+# Train and evaluate
+###################################
+if train_button:
+    st.write(f"Training **{model_type}** model...")
+    if model_type == "N-gram":
+        with st.spinner("Training N-gram model..."):
+            model = NGramModel(tokens, n=3)
+    elif model_type == "Feed Forward NN":
+        model = train_ffnn(tokens)
+    elif model_type == "Decision Tree":
+        model = train_dt(tokens)
+    elif model_type == "Gradient Boosted Tree":
+        model = train_gbt(tokens)
+    elif model_type == "RNN":
+        model = train_rnn(tokens)
+    st.session_state["model"] = model
+    st.session_state["model_type"] = model_type
+    st.success(f"{model_type} model trained.")
+###################################
+# Chat interface
+###################################
+st.header("💬 Chat with the model")
+if "model" in st.session_state:
+    user_input = st.text_input("Type a prompt:")
+    if user_input:
+        context = tokenize(user_input, tokenizer_type)
+        generated = context.copy()
+        for _ in range(20):
+            if st.session_state["model_type"] == "N-gram":
+                next_tok = st.session_state["model"].predict(generated, temperature)
+            elif st.session_state["model_type"] == "Feed Forward NN":
+                next_tok = ffnn_predict(st.session_state["model"], generated, temperature)
+            elif st.session_state["model_type"] == "Decision Tree":
+                next_tok = dt_predict(st.session_state["model"], generated)
+            elif st.session_state["model_type"] == "Gradient Boosted Tree":
+                next_tok = gbt_predict(st.session_state["model"], generated)
+            elif st.session_state["model_type"] == "RNN":
+                next_tok = rnn_predict(st.session_state["model"], generated, temperature)
+            generated.append(next_tok)
+            if next_tok == "<END>":
+                break
+        if tokenizer_type == "character":
+            output = "".join(generated)
+        else:
+            output = " ".join(generated)
+        st.write("**Model Output:**")
+        st.write(output)
+else:
+    st.info("Train a model to begin chatting.")