Spaces: Running on Zero

Ruurd committed on
Commit 7252f98 · 1 Parent(s): acbd7fa

First commit

Files changed (4)
  1. app.py +158 -0
  2. llama_diffusion_model.py +134 -0
  3. requirements.txt +7 -0
  4. token_probabilities.json +0 -0
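
Both Python files in this commit read a Hugging Face access token from the environment, since meta-llama/Llama-3.2-3B is a gated checkpoint. A minimal sketch of that precondition (the assert is illustrative and not part of the commit; on Spaces the token would typically be configured as a repository secret):

import os

# HF_TOKEN must be set (e.g. as a Space secret) and must grant access to the
# gated meta-llama/Llama-3.2-3B repository; otherwise the tokenizer and model
# downloads in app.py / llama_diffusion_model.py below will fail.
hf_token = os.getenv("HF_TOKEN")
assert hf_token, "HF_TOKEN is not set"
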
app.py ADDED
@@ -0,0 +1,158 @@
+ import gradio as gr
+ import torch
+ import numpy as np
+ import json
+ import time
+ from transformers import AutoTokenizer
+ from llama_diffusion_model import CustomTransformerModel, CustomTransformerConfig, disable_dropout
+ import os
+
+ hf_token = os.getenv("HF_TOKEN")
+
+
+ # --- Load tokenizer ---
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B", use_fast=True, token=hf_token)
+ vocab_size = len(tokenizer)
+ pad_token = tokenizer.pad_token_id or tokenizer.eos_token_id
+ eot_token_id = tokenizer.eos_token_id
+ assistant_marker_ids = tokenizer.encode("Assistant:", add_special_tokens=False)
+
+ # --- Load token probabilities ---
+ with open("token_probabilities.json") as f:
+     token_probs_dict = json.load(f)
+ token_probabilities = np.array([token_probs_dict[str(i)] for i in range(len(token_probs_dict))], dtype=np.float32)
+
+
+ def load_model():
+     config = CustomTransformerConfig(vocab_size=vocab_size)
+     model = CustomTransformerModel(config)
+     model.load_state_dict(torch.hub.load_state_dict_from_url(
+         "https://huggingface.co/Ruurd/tini_model/resolve/main/diffusion-model.pth",
+         map_location="cuda",
+         headers={"Authorization": f"Bearer {hf_token}"}
+     ))
+     model = disable_dropout(model)
+     model.to("cuda")
+     model.eval()
+     return model
+
+ rng = np.random.default_rng()
+
+ # --- Utility Functions ---
+ def decode_tokens_safe(token_ids):
+     return tokenizer.decode(token_ids, skip_special_tokens=True).replace("\n", " ")
+
+ def find_answer_start(input_ids, marker_ids):
+     for i in range(len(input_ids) - len(marker_ids) + 1):
+         if input_ids[i:i + len(marker_ids)] == marker_ids:
+             return i + len(marker_ids)
+     return None
+
+ def get_noising_schedule(i, max_it, sharpness=5.0):
+     x = i / max_it
+     return (np.exp(-sharpness * x) - np.exp(-sharpness)) / (1 - np.exp(-sharpness))
+
+ def noisify_answer(input_ids, answer_start, threshold=1.0, eot_weight=1.0):
+     noised = input_ids.copy()
+     answer_len = len(input_ids) - answer_start
+     num_to_noise = int(threshold * answer_len)
+     if num_to_noise > 0:
+         indices = rng.choice(np.arange(answer_start, len(input_ids)), size=num_to_noise, replace=False)
+
+         mixed_probs = token_probabilities.copy()
+         mixed_probs[eot_token_id] *= eot_weight
+         mixed_probs /= mixed_probs.sum()
+
+         noise = rng.choice(np.arange(vocab_size), size=num_to_noise, p=mixed_probs)
+         for idx, val in zip(indices, noise):
+             noised[idx] = val
+     return noised
+
+ def generate_diffusion_text(model, input_ids, answer_start):
+     with torch.no_grad():
+         input_tensor = torch.tensor([input_ids], dtype=torch.long).to(model.device)
+         logits = model(input_ids=input_tensor)["logits"]
+         probs = torch.nn.functional.softmax(logits, dim=-1).squeeze()
+         probs = torch.clamp(probs, min=1e-8, max=1.0)
+         sampled = torch.multinomial(probs, num_samples=1).squeeze().tolist()
+     return input_ids[:answer_start] + sampled[answer_start:]
+
+ # --- Inference Wrapper ---
+ def diffusion_chat(question, eot_weight, max_it, sharpness, model):
+     placeholder = "What do you know about the city of New York?"
+     if question.strip() == "":
+         question = placeholder
+
+     prompt = f"User: {question}\nAssistant:"
+     input_ids = tokenizer.encode(prompt, add_special_tokens=False)
+     answer_start = find_answer_start(input_ids, assistant_marker_ids)
+     if answer_start is None:
+         yield "Error: Could not find Assistant marker in input."
+         return
+
+     if len(input_ids) < 256:
+         input_ids += [pad_token] * (256 - len(input_ids))
+     else:
+         input_ids = input_ids[:256]
+
+     ori_input_tokens = input_ids
+     current_tokens = noisify_answer(ori_input_tokens, answer_start, threshold=1.0, eot_weight=eot_weight)
+     prev_decoded_tokens = []
+     last_tokens = []
+
+     for i in range(max_it):
+         generated_tokens = generate_diffusion_text(model, current_tokens, answer_start)
+         current_tokens = generated_tokens
+
+         decoded_ids = current_tokens[answer_start:]
+         decoded_tokens = tokenizer.convert_ids_to_tokens(decoded_ids)
+         filtered_tokens = [tok for tok in decoded_tokens if tokenizer.convert_tokens_to_ids(tok) != eot_token_id]
+         filtered_prev_tokens = [tok for tok in prev_decoded_tokens if tokenizer.convert_tokens_to_ids(tok) != eot_token_id] if prev_decoded_tokens else []
+
+         if filtered_prev_tokens:
+             highlighted = []
+             for tok_new, tok_old in zip(filtered_tokens, filtered_prev_tokens):
+                 if tok_new != tok_old:
+                     highlighted.append(f'<span style="color:green">{tokenizer.convert_tokens_to_string([tok_new])}</span>')
+                 else:
+                     highlighted.append(tokenizer.convert_tokens_to_string([tok_new]))
+         else:
+             highlighted = [tokenizer.convert_tokens_to_string([tok]) for tok in filtered_tokens]
+
+         prev_decoded_tokens = decoded_tokens
+         yield f"<b>Iteration {i+1}/{max_it} (running):</b><br>" + "".join(highlighted)
+
+         last_tokens.append(generated_tokens)
+         if len(last_tokens) > 3:
+             last_tokens.pop(0)
+         if len(last_tokens) == 3 and last_tokens[0] == last_tokens[1] == last_tokens[2]:
+             yield f"<b>Stopped early after {i+1} iterations.</b>"
+             break
+
+         threshold = get_noising_schedule(i, max_it, sharpness=sharpness)
+         current_tokens = noisify_answer(generated_tokens, answer_start, threshold=threshold, eot_weight=eot_weight)
+         time.sleep(0.01)
+
+     final_tokens = tokenizer.convert_ids_to_tokens(current_tokens[answer_start:])
+     final_tokens = [tok for tok in final_tokens if tokenizer.convert_tokens_to_ids(tok) != eot_token_id]
+     final_output = tokenizer.convert_tokens_to_string(final_tokens)
+     yield f"<b>Final Output (after {i+1} iterations):</b><br>" + final_output
+
+ # --- Gradio Interface ---
+ model_state = gr.State(load_model())
+
+ demo = gr.Interface(
+     fn=diffusion_chat,
+     inputs=[
+         gr.Textbox(label="User Question", lines=2, placeholder="What do you know about the city of New York?"),
+         gr.Slider(0, 1, value=0.4, step=0.05, label="↓ = longer answers (EOT weight)"),
+         gr.Slider(1, 512, value=64, step=1, label="↑ = more iterations"),
+         gr.Slider(1.0, 20.0, value=5.0, step=0.5, label="↓ = more noising (sharpness)"),
+         model_state
+     ],
+     outputs=gr.HTML(label="Diffusion Output"),
+     title="Diffusion Language Model Chat",
+     description="This interface runs a diffusion-based language model to generate answers progressively."
+ )
+
+ demo.launch()
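
Condensed, the procedure in app.py is: fully noise the answer region of a fixed 256-token prompt, then alternate a denoising forward pass (per-position sampling from the model's logits) with partial re-noising under an exponentially decaying threshold. A minimal sketch of that loop, reusing the functions defined above with the Gradio defaults (eot_weight=0.4, 64 iterations, sharpness=5.0); this is an illustration, not an exact copy of diffusion_chat:

model = load_model()

prompt = "User: What do you know about the city of New York?\nAssistant:"
input_ids = tokenizer.encode(prompt, add_special_tokens=False)
answer_start = find_answer_start(input_ids, assistant_marker_ids)
input_ids = (input_ids + [pad_token] * 256)[:256]  # pad/trim to the fixed 256-token chunk

current = noisify_answer(input_ids, answer_start, threshold=1.0, eot_weight=0.4)  # fully noised start
for i in range(64):
    current = generate_diffusion_text(model, current, answer_start)   # denoise: resample answer tokens
    threshold = get_noising_schedule(i, 64, sharpness=5.0)            # decays from 1.0 towards 0.0
    current = noisify_answer(current, answer_start, threshold=threshold, eot_weight=0.4)  # partial re-noise

print(tokenizer.decode(current[answer_start:], skip_special_tokens=True))
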
llama_diffusion_model.py ADDED
@@ -0,0 +1,134 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.amp import autocast
+ from transformers import AutoModelForCausalLM, PreTrainedModel, PretrainedConfig
+ from peft import LoraConfig, get_peft_model
+ import os
+
+ hf_token = os.getenv("HF_TOKEN")
+
+ class BidirectionalLlamaAttention(nn.Module):
+     def __init__(self, original_layer, masking='unidirectional'):
+         super().__init__()
+         self.original = original_layer
+         self.masking = masking
+
+         self.q_proj = original_layer.q_proj
+         self.k_proj = original_layer.k_proj
+         self.v_proj = original_layer.v_proj
+         self.o_proj = original_layer.o_proj
+
+         self.head_dim = self.q_proj.out_features // original_layer.num_heads
+         self.num_heads = original_layer.num_heads
+         self.num_key_value_groups = original_layer.num_key_value_groups
+         self.attention_dropout = original_layer.attention_dropout
+         self.layer_idx = original_layer.layer_idx
+         self.scaling = original_layer.scaling
+
+     def forward(self, hidden_states, position_embeddings, attention_mask=None, past_key_value=None, cache_position=None, **kwargs):
+         bsz, seq_len, _ = hidden_states.size()
+
+         query_states = self._split_heads(self.q_proj(hidden_states))
+         key_states = self._split_heads(self.k_proj(hidden_states))
+         value_states = self._split_heads(self.v_proj(hidden_states))
+
+         cos, sin = position_embeddings
+         query_states, key_states = self._apply_rotary(query_states, key_states, cos, sin)
+
+         if self.masking == 'bidirectional':
+             attn_mask = torch.ones((bsz, 1, seq_len, seq_len), device=hidden_states.device)
+         else:
+             attn_mask = torch.tril(torch.ones(seq_len, seq_len, device=hidden_states.device)).unsqueeze(0).unsqueeze(0)
+
+         attn_weights = torch.matmul(query_states, key_states.transpose(-2, -1)) * self.scaling
+         attn_weights = attn_weights + attn_mask.log()
+         attn_weights = F.softmax(attn_weights, dim=-1)
+         attn_weights = F.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+
+         attn_output = torch.matmul(attn_weights, value_states)
+         attn_output = self._merge_heads(attn_output)
+         return self.o_proj(attn_output), attn_weights
+
+     def _split_heads(self, x):
+         return x.view(x.size(0), x.size(1), self.num_heads, self.head_dim).transpose(1, 2)
+
+     def _merge_heads(self, x):
+         return x.transpose(1, 2).contiguous().view(x.size(0), -1, self.num_heads * self.head_dim)
+
+     def _apply_rotary(self, q, k, cos, sin):
+         cos = cos.unsqueeze(1)
+         sin = sin.unsqueeze(1)
+         q_rot = (q * cos) + (self._rotate_half(q) * sin)
+         k_rot = (k * cos) + (self._rotate_half(k) * sin)
+         return q_rot, k_rot
+
+     def _rotate_half(self, x):
+         x1 = x[..., : x.shape[-1] // 2]
+         x2 = x[..., x.shape[-1] // 2 :]
+         return torch.cat((-x2, x1), dim=-1)
+
+
+ class CustomTransformerConfig(PretrainedConfig):
+     def __init__(self, vocab_size=128256, hidden_size=4096, num_layers=32, num_heads=32, prediction_chunk=256, dropout=0, max_position_embeddings=4096, **kwargs):
+         super().__init__(**kwargs)
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.num_layers = num_layers
+         self.num_heads = num_heads
+         self.dropout = dropout
+         self.prediction_chunk = prediction_chunk
+         self.max_position_embeddings = max_position_embeddings
+
+
+ class CustomTransformerModel(PreTrainedModel):
+     config_class = CustomTransformerConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+
+         self.llama = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B", torch_dtype=torch.float16, token=hf_token)
+         self.llama.resize_token_embeddings(config.vocab_size)
+
+         for i, layer in enumerate(self.llama.model.layers):
+             layer.self_attn = BidirectionalLlamaAttention(layer.self_attn, masking='bidirectional')
+
+         for param in self.llama.parameters():
+             param.requires_grad = False
+
+         for param in self.llama.lm_head.parameters():
+             param.requires_grad = True
+
+         lora_config = LoraConfig(
+             r=256,
+             lora_alpha=256,
+             lora_dropout=0.0,
+             target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
+             bias="none",
+             task_type=None
+         )
+
+         self.llama = get_peft_model(self.llama, lora_config)
+         self.llama = self.llama.to(torch.float16)
+
+     def forward(self, input_ids, labels=None, **kwargs):
+         batch_size, seq_length = input_ids.shape
+         assert seq_length == self.config.prediction_chunk
+
+         with autocast("cuda", dtype=torch.float16):
+             outputs = self.llama(input_ids=input_ids, output_hidden_states=True, **kwargs)
+             logits = outputs.logits[:, :, :self.config.vocab_size].view(batch_size, self.config.prediction_chunk, self.config.vocab_size)
+
+         loss = None
+         if labels is not None:
+             loss_fct = nn.CrossEntropyLoss()
+             loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+         return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}
+
+
+ def disable_dropout(model):
+     for name, module in model.named_modules():
+         if isinstance(module, nn.Dropout):
+             parent, _, child = name.rpartition(".")  # resolve the parent so nested (dotted) modules are actually replaced
+             setattr(model.get_submodule(parent) if parent else model, child, nn.Identity())
+     return model
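
A rough, hedged sketch of how this module is driven (app.py above additionally loads fine-tuned weights from Ruurd/tini_model before use; this snippet only exercises shapes with the base Llama weights and assumes a CUDA device plus a valid HF_TOKEN):

import torch
from llama_diffusion_model import CustomTransformerConfig, CustomTransformerModel, disable_dropout

config = CustomTransformerConfig(vocab_size=128256, prediction_chunk=256)
model = disable_dropout(CustomTransformerModel(config)).to("cuda").eval()

# forward() asserts seq_length == prediction_chunk, so inputs must be exactly 256 tokens wide.
dummy = torch.randint(0, config.vocab_size, (1, config.prediction_chunk), device="cuda")
with torch.no_grad():
    out = model(input_ids=dummy)
print(out["logits"].shape)  # expected: torch.Size([1, 256, 128256])
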
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ torch>=2.0.0
+ transformers>=4.38.0
+ datasets>=2.16.0
+ peft>=0.8.2
+ accelerate>=0.24.1
+ gradio>=4.10.0
+ numpy
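
A quick, hedged sanity check that an environment satisfies these pins and exposes the GPU both modules assume (not part of the commit):

import torch, transformers, peft, accelerate, gradio, datasets, numpy

print("torch", torch.__version__, "| transformers", transformers.__version__,
      "| peft", peft.__version__, "| gradio", gradio.__version__)
print("CUDA available:", torch.cuda.is_available())  # app.py and the model wrapper both hard-code "cuda"
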
token_probabilities.json ADDED
The diff for this file is too large to render. See raw diff
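
The raw file is not shown here, but app.py above fixes its expected shape: a JSON object mapping every token id (as a string) to a sampling probability over the tokenizer vocabulary. A small, hedged validation sketch:

import json
import numpy as np

with open("token_probabilities.json") as f:
    token_probs_dict = json.load(f)  # {"0": p0, "1": p1, ...}, one entry per token id

probs = np.array([token_probs_dict[str(i)] for i in range(len(token_probs_dict))], dtype=np.float32)
print(len(probs), float(probs.sum()))  # should match the tokenizer vocab size and sum to roughly 1.0
# (app.py re-normalises after re-weighting the EOT token, so an approximate sum is fine)
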