phanerozoic committed
Commit 502a6b6 · verified
1 Parent(s): ef0a942

Update app.py

Files changed (1)
  1. app.py +106 -240
app.py CHANGED
@@ -1,285 +1,151 @@
- # ────────────────────────────────────────────────────────────────────────────
- # SchoolSpirit AI Chat – robust edition
- # ────────────────────────────────────────────────────────────────────────────
- # • FP‑16 GPU load → CPU float32 fallback
- # • Streaming responses with retry
- # • Token‑aware context trimming (keeps within model window)
- # • One‑time system + welcome message (no duplication)
- # • Extensive logging
- # ────────────────────────────────────────────────────────────────────────────
- from __future__ import annotations
-
- import asyncio
- import datetime as _dt
- import os
- import re
- import time
- import traceback
- from dataclasses import dataclass
- from pathlib import Path
- from typing import Any, Dict, List, Tuple
-
  import gradio as gr
- import torch
- from transformers import (
-     AutoModelForCausalLM,
-     AutoTokenizer,
-     GenerationConfig,
-     TextIteratorStreamer,
-     pipeline,
- )
  from transformers.utils import logging as hf_logging

- # ────────────────────────────────────────────────────────────────────────────
- # 0. ENV / LOGGING
- # ────────────────────────────────────────────────────────────────────────────
  os.environ["HF_HOME"] = "/data/.huggingface"
- LOG_FILE = Path("/data/requests.log")
- LOG_FILE.parent.mkdir(parents=True, exist_ok=True)


- def log(line: str) -> None:
-     ts = _dt.datetime.utcnow().strftime("%H:%M:%S.%f")[:-3]
-     entry = f"[{ts}] {line}"
-     print(entry, flush=True)
      try:
-         with LOG_FILE.open("a") as f:
-             f.write(entry + "\n")
-     except Exception:
          pass


- hf_logging.set_verbosity_error()
-
-
- # ────────────────────────────────────────────────────────────────────────────
- # 1. CONFIG
- # ────────────────────────────────────────────────────────────────────────────
- @dataclass
- class Config:
-     MODEL_ID: str = "ibm-granite/granite-3.3-2b-instruct"
-     MAX_MODEL_TOKENS: int = 2048
-     MAX_NEW_TOKENS: int = 64
-     TEMPERATURE: float = 0.6
-     TOP_P: float = 0.9
-     MAX_INPUT_CH: int = 300
-     CONTEXT_MARGIN: int = 128  # leave room for assistant completion
-     STREAMING_CHUNK: float = 0.05  # seconds
-     SYSTEM_PROMPT: str = (
-         "You are **SchoolSpirit AI**, the digital mascot for SchoolSpirit AI LLC, "
-         "founded by Charles Norton in 2025. The company installs on‑prem AI chat "
-         "mascots, offers custom fine‑tuning, and ships turnkey GPU hardware to "
-         "K‑12 schools.\n\n"
-         "GUIDELINES:\n"
-         "• Warm, encouraging tone for students, parents, staff.\n"
-         "• Replies ≤ 4 sentences unless asked for detail.\n"
-         "• If unsure/out‑of‑scope: say so and suggest human follow‑up.\n"
-         "• No personal‑data collection or sensitive advice.\n"
-         "• No profanity, politics, or mature themes."
-     )
-     WELCOME_MSG: str = "Welcome to SchoolSpirit AI! Do you have any questions?"


- CFG = Config()

- # ────────────────────────────────────────────────────────────────────────────
- # 2. LOAD MODEL (GPU FP‑16 → CPU fallback)
- # ────────────────────────────────────────────────────────────────────────────
- def load_pipeline() -> pipeline:
      log("Loading tokenizer …")
-     tok = AutoTokenizer.from_pretrained(CFG.MODEL_ID)
-
-     use_gpu = torch.cuda.is_available()
-     dtype = torch.float16 if use_gpu else torch.float32
-     log(f"{'GPU' if use_gpu else 'CPU'} detected → dtype {dtype}")
-
-     model = AutoModelForCausalLM.from_pretrained(
-         CFG.MODEL_ID,
-         device_map="auto" if use_gpu else "cpu",
-         torch_dtype=dtype,
-         low_cpu_mem_usage=not use_gpu,
-     )

-     gen_cfg = GenerationConfig(
-         max_new_tokens=CFG.MAX_NEW_TOKENS,
-         temperature=CFG.TEMPERATURE,
-         top_p=CFG.TOP_P,
-     )

-     pipe = pipeline(
          "text-generation",
          model=model,
          tokenizer=tok,
-         generation_config=gen_cfg,
      )
-     pipe.tokenizer.padding_side = "left"
-     pipe.tokenizer.pad_token_id = pipe.model.config.eos_token_id
-     log("Model & pipeline loaded ✔")
-     return pipe
-
-
- try:
-     PIPE = load_pipeline()
      MODEL_ERR = None
  except Exception as exc:  # noqa: BLE001
-     MODEL_ERR = str(exc)
-     log(f"Model load error: {exc}")
-
- # ────────────────────────────────────────────────────────────────────────────
- # 3. HELPER FUNCTIONS
- # ────────────────────────────────────────────────────────────────────────────
- _tokenizer = PIPE.tokenizer if PIPE else None
- _strip = lambda s: re.sub(r"\s+", " ", s.strip())
-
-
- def build_prompt(raw: List[Dict[str, str]]) -> str:
-     """
-     raw: [{'role':'system'|'user'|'assistant', 'content': str}, ...]
-     """
-     lines: List[str] = []
-     for m in raw:
-         if m["role"] == "system":
-             lines.append(m["content"])
-         else:
-             prefix = "User" if m["role"] == "user" else "AI"
-             lines.append(f"{prefix}: {m['content']}")
-     lines.append("AI:")
-     return "\n".join(lines)
-
-
- def trim_to_window(raw: List[Dict[str, str]]) -> List[Dict[str, str]]:
-     """
-     Trim raw history so total tokens <= model window - margin.
-     Always keep the initial system message.
-     """
-     if not PIPE:
-         return raw
-     max_total = CFG.MAX_MODEL_TOKENS - CFG.CONTEXT_MARGIN
-     while True:
-         toks = len(_tokenizer.encode(build_prompt(raw)))
-         if toks <= max_total or len(raw) <= 2:
-             return raw
-         # Remove second message (first non‑system) then loop
-         raw.pop(1)
-
-
- # ────────────────────────────────────────────────────────────────────────────
- # 4. CHAT HANDLER
- # ────────────────────────────────────────────────────────────────────────────
- async def generate_stream(prompt: str):
-     """
-     Yields partial text chunks for streaming.
-     """
-     streamer = TextIteratorStreamer(
-         PIPE.tokenizer, skip_prompt=True, skip_special_tokens=True
-     )
-     gen_kwargs = dict(prompt, streamer=streamer)
-     loop = asyncio.get_event_loop()
-     task = loop.run_in_executor(None, PIPE.model.generate, **gen_kwargs)
-
-     # Stream chunks
-     async for token in streamer:
-         yield token
-         await asyncio.sleep(CFG.STREAMING_CHUNK)
-     await task  # ensure generation done


- def respond(
-     user_msg: str, chat_hist: List[Tuple[str, str]], state: Dict[str, Any]
- ) -> Tuple[List[Tuple[str, str]], Dict[str, Any]]:
      """
-     Gradio synchronous wrapper that kicks off async generation.
      """
      if MODEL_ERR:
-         chat_hist.append((user_msg, MODEL_ERR))
-         return chat_hist, state

-     user_msg = _strip(user_msg or "")
      if not user_msg:
-         chat_hist.append((user_msg, "Please type something."))
-         return chat_hist, state
-     if len(user_msg) > CFG.MAX_INPUT_CH:
-         chat_hist.append(
-             (user_msg, f"Message too long (>{CFG.MAX_INPUT_CH} chars).")
-         )
-         return chat_hist, state
-
-     raw = state["raw"]
-     raw.append({"role": "user", "content": user_msg})
-     raw = trim_to_window(raw)
-     prompt = build_prompt(raw)
-
-     # Streaming generation
-     streamer = TextIteratorStreamer(
-         PIPE.tokenizer, skip_prompt=True, skip_special_tokens=True
-     )
-     gen_task = PIPE.model.generate(
-         PIPE.tokenizer(prompt, return_tensors="pt").to(PIPE.model.device)["input_ids"],
-         streamer=streamer,
-         max_new_tokens=CFG.MAX_NEW_TOKENS,
-         temperature=CFG.TEMPERATURE,
-         top_p=CFG.TOP_P,
      )

-     reply = ""
-     for token in streamer:
-         reply += token
-         chat_hist[-1] = (user_msg, reply)
-         yield chat_hist, state

-     raw.append({"role": "assistant", "content": reply})
-     state["raw"] = raw
-     yield chat_hist, state


- # ────────────────────────────────────────────────────────────────────────────
- # 5. LAUNCH UI (Gradio Blocks)
- # ────────────────────────────────────────────────────────────────────────────
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
-     gr.Markdown("# 🏫 SchoolSpirit AI Chat")
-
      chatbot = gr.Chatbot(
-         value=[("", CFG.WELCOME_MSG)],
-         height=480,
-         label="SchoolSpirit AI",
-     )
-
-     state = gr.State(
-         {"raw": [{"role": "system", "content": CFG.SYSTEM_PROMPT}]}
      )
-
      with gr.Row():
          txt = gr.Textbox(
-             scale=4,
-             placeholder="Ask me anything about SchoolSpirit AI …",
-             show_label=False,
          )
          send = gr.Button("Send", variant="primary")

-     # Bind both button click and ENTER keypress
-     for trigger in (send, txt):
-         trigger.click(
-             respond,
-             inputs=[txt, chatbot, state],
-             outputs=[chatbot, state],
-         ).then(
-             lambda: "",
-             None,
-             txt,
-         )  # clear textbox
-
-     demo.load(lambda: None)  # dummy to ensure Blocks builds
-
- # ---------------------------------------------------------------------------
- # Graceful shutdown (for HF Space restarts)
- # ---------------------------------------------------------------------------
- def _shutdown(*_):
-     log("Space shutting down …")
-
-
- import atexit, signal  # noqa: E402
-
- atexit.register(_shutdown)
- signal.signal(signal.SIGTERM, lambda *_: _shutdown())
- signal.signal(signal.SIGINT, lambda *_: _shutdown())

  demo.launch()

+ import os, re, time, datetime, traceback, torch
  import gradio as gr
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
  from transformers.utils import logging as hf_logging

+ # -------------------------------------------------------------------
+ # 1. Logging helpers
+ # -------------------------------------------------------------------
  os.environ["HF_HOME"] = "/data/.huggingface"
+ LOG_FILE = "/data/requests.log"


+ def log(msg: str):
+     ts = datetime.datetime.utcnow().strftime("%H:%M:%S.%f")[:-3]
+     line = f"[{ts}] {msg}"
+     print(line, flush=True)
      try:
+         with open(LOG_FILE, "a") as f:
+             f.write(line + "\n")
+     except FileNotFoundError:
          pass

+ # -------------------------------------------------------------------
+ # 2. Configuration
+ # -------------------------------------------------------------------
+ MODEL_ID = "ibm-granite/granite-3.3-2b-instruct"
+ MAX_TURNS, MAX_TOKENS, MAX_INPUT_CH = 4, 64, 300
+
+ SYSTEM_MSG = (
+     "You are **SchoolSpirit AI**, the digital mascot for SchoolSpirit AI LLC, "
+     "founded by Charles Norton in 2025. The company installs on‑prem AI chat "
+     "mascots, offers custom fine‑tuning, and ships turnkey GPU hardware to "
+     "K‑12 schools.\n\n"
+     "GUIDELINES:\n"
+     "• Warm, encouraging tone for students, parents, staff.\n"
+     "• Replies ≤ 4 sentences unless asked for detail.\n"
+     "• If unsure/out‑of‑scope: say so and suggest human follow‑up.\n"
+     "• No personal‑data collection or sensitive advice.\n"
+     "• No profanity, politics, or mature themes."
+ )
+ WELCOME_MSG = "Welcome to SchoolSpirit AI! Do you have any questions?"


+ def strip(s: str) -> str:
+     return re.sub(r"\s+", " ", s.strip())

+ # -------------------------------------------------------------------
+ # 3. Load model (GPU FP‑16 → CPU fallback)
+ # -------------------------------------------------------------------
+ hf_logging.set_verbosity_error()
+ try:
      log("Loading tokenizer …")
+     tok = AutoTokenizer.from_pretrained(MODEL_ID)

+     if torch.cuda.is_available():
+         log("GPU detected → FP‑16")
+         model = AutoModelForCausalLM.from_pretrained(
+             MODEL_ID, device_map="auto", torch_dtype=torch.float16
+         )
+     else:
+         log("CPU fallback")
+         model = AutoModelForCausalLM.from_pretrained(
+             MODEL_ID, device_map="cpu", torch_dtype="auto", low_cpu_mem_usage=True
+         )

+     gen = pipeline(
          "text-generation",
          model=model,
          tokenizer=tok,
+         max_new_tokens=MAX_TOKENS,
+         do_sample=True,
+         temperature=0.6,
      )
      MODEL_ERR = None
+     log("Model loaded ✔")
  except Exception as exc:  # noqa: BLE001
+     MODEL_ERR, gen = f"Model load error: {exc}", None
+     log(MODEL_ERR)


+ # -------------------------------------------------------------------
+ # 4. Chat callback
+ # -------------------------------------------------------------------
+ def chat_fn(user_msg: str, history: list[tuple[str, str]], state: dict):
      """
+     history: list of (user, assistant) tuples (Gradio default)
+     state  : dict carrying system_prompt + raw_history for the model
+     Returns updated history (for UI) and state (for next round)
      """
      if MODEL_ERR:
+         return history + [(user_msg, MODEL_ERR)], state

+     user_msg = strip(user_msg or "")
      if not user_msg:
+         return history + [(user_msg, "Please type something.")], state
+     if len(user_msg) > MAX_INPUT_CH:
+         warn = f"Message too long (>{MAX_INPUT_CH} chars)."
+         return history + [(user_msg, warn)], state
+
+     # ------------------------------------------------ Prompt assembly
+     raw_hist = state.get("raw", [])
+     raw_hist.append({"role": "user", "content": user_msg})
+     # keep system + last N exchanges
+     convo = [m for m in raw_hist if m["role"] != "system"][-MAX_TURNS * 2 :]
+     raw_hist = [{"role": "system", "content": SYSTEM_MSG}] + convo
+
+     prompt = "\n".join(
+         [
+             m["content"]
+             if m["role"] == "system"
+             else f'{"User" if m["role"]=="user" else "AI"}: {m["content"]}'
+             for m in raw_hist
+         ]
+         + ["AI:"]
      )

+     try:
+         raw = gen(prompt)[0]["generated_text"]
+         reply = strip(raw.split("AI:", 1)[-1])
+         reply = re.split(r"\b(?:User:|AI:)", reply, 1)[0].strip()
+     except Exception:
+         log("❌ Inference error:\n" + traceback.format_exc())
+         reply = "Sorry—backend crashed. Please try again later."

+     # ------------------------------------------------ Update state + UI history
+     raw_hist.append({"role": "assistant", "content": reply})
+     state["raw"] = raw_hist
+     history.append((user_msg, reply))
+     return history, state


+ # -------------------------------------------------------------------
+ # 5. Launch
+ # -------------------------------------------------------------------
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
      chatbot = gr.Chatbot(
+         value=[("", WELCOME_MSG)], height=480, label="SchoolSpirit AI"
      )
+     state = gr.State({"raw": [{"role": "system", "content": SYSTEM_MSG}]})
      with gr.Row():
          txt = gr.Textbox(
+             scale=4, placeholder="Type your question here...", show_label=False
          )
          send = gr.Button("Send", variant="primary")

+     send.click(chat_fn, inputs=[txt, chatbot, state], outputs=[chatbot, state])
+     txt.submit(chat_fn, inputs=[txt, chatbot, state], outputs=[chatbot, state])

  demo.launch()
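
Note on the new history policy: the commit drops the old token-counting trim_to_window() and instead keeps the system prompt plus only the last MAX_TURNS user/assistant exchanges before flattening them into a "User:/AI:" prompt. The snippet below is a minimal standalone sketch of that step for reference; the constant values and the dummy history are illustrative only and are not part of the commit.

# Standalone sketch of the trimming + prompt assembly used inside chat_fn.
# MAX_TURNS and SYSTEM_MSG are placeholder values for illustration.
MAX_TURNS = 4
SYSTEM_MSG = "You are SchoolSpirit AI."

# Fake conversation: system message followed by ten alternating turns.
raw_hist = [{"role": "system", "content": SYSTEM_MSG}] + [
    {"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"}
    for i in range(10)
]

# Keep the system prompt plus the last MAX_TURNS exchanges (two messages each).
convo = [m for m in raw_hist if m["role"] != "system"][-MAX_TURNS * 2:]
raw_hist = [{"role": "system", "content": SYSTEM_MSG}] + convo

# Flatten into the plain-text prompt the pipeline is asked to continue.
prompt = "\n".join(
    [
        m["content"]
        if m["role"] == "system"
        else f'{"User" if m["role"] == "user" else "AI"}: {m["content"]}'
        for m in raw_hist
    ]
    + ["AI:"]
)

print(len(raw_hist))            # 9: the system message plus the eight kept messages
print(prompt.splitlines()[-1])  # "AI:" is the completion cue the model continues from

Running the sketch shows the window stays bounded regardless of conversation length, which is the simpler replacement for the old token-budget trimming.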