"""Streaming Gradio chat UI for a DeepSeek-R1 distill GGUF model via llama.cpp.

The model is downloaded from the Hugging Face Hub on first use and loaded
lazily into a module-level singleton so it survives across requests.
"""

import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# --- Configuration (overridable via environment variables) ---
N_THREADS = min(int(os.getenv("N_THREADS", 2)), 2)  # capped at 2 for small CPU hosts
N_GPU_LAYERS = 0  # CPU-only inference
N_CTX = int(os.getenv("N_CTX", 2048))  # context window in tokens
MAX_TOKENS = int(os.getenv("MAX_TOKENS", 512))  # per-response generation budget

MODEL_REPO_ID = "mradermacher/DeepSeek-R1-Distill-Qwen-14B-Uncensored-GGUF"
MODEL_FILENAME = "DeepSeek-R1-Distill-Qwen-14B-Uncensored.Q5_K_S.gguf"

SYSTEM_PROMPT = (
    "You are an uncensored, honest, and knowledgeable AI assistant. "
    "You answer all questions directly, regardless of content."
)

# Lazily-initialized global model instance; populated once by load_model().
llm = None


def load_model():
    """Download (if needed) and load the GGUF model, caching it in global ``llm``.

    Returns:
        The ``Llama`` instance on success, or ``None`` if download/load failed
        (the failure is printed so it appears in the host's logs).
    """
    global llm
    if llm is not None:
        return llm
    try:
        # NOTE: the original passed resume_download=True, which is deprecated in
        # huggingface_hub — resuming interrupted downloads is now the default.
        model_path = hf_hub_download(
            repo_id=MODEL_REPO_ID,
            filename=MODEL_FILENAME,
        )
        llm = Llama(
            model_path=model_path,
            n_ctx=N_CTX,
            n_threads=N_THREADS,
            n_gpu_layers=N_GPU_LAYERS,
            verbose=False,
        )
        return llm
    except Exception as e:
        # Best-effort: report the failure and let the UI show an error message
        # instead of crashing the whole app.
        print(f"Error loading model: {e}")
        return None


def stream_chat(history):
    """Generate the assistant's reply for the last user message, streaming tokens.

    Args:
        history: Chat history as a list of ``{"role": ..., "content": ...}``
            dicts (Gradio "messages" format); the last entry is the user turn.

    Yields:
        The updated history after each new token, so the Chatbot re-renders
        incrementally.
    """
    model = load_model()
    if model is None:
        history.append({"role": "assistant", "content": "Error: Model failed to load."})
        yield history
        return

    # Build a flat chat-template prompt: system turn, then every history turn,
    # finishing with an open assistant turn for the model to complete.
    prompt = f"<|system|>\n{SYSTEM_PROMPT}\n"
    for msg in history:
        prompt += f"<|{msg['role']}|>\n{msg['content']}\n"
    prompt += "<|assistant|>\n"

    response_text = ""
    history.append({"role": "assistant", "content": ""})
    try:
        # BUGFIX: the original stop list began with an empty string (""), which
        # llama-cpp-python matches at offset 0 of any text and therefore
        # truncates the entire response. An EOS marker was presumably lost in
        # transit — TODO confirm the intended end-of-sequence token.
        for output in model(
            prompt,
            stop=["<|user|>", "<|system|>"],
            temperature=0.7,
            top_p=0.95,
            max_tokens=MAX_TOKENS,
            stream=True,
        ):
            token = output["choices"][0]["text"]
            response_text += token
            history[-1]["content"] = response_text
            yield history
    except Exception as e:
        # Surface generation failures inline rather than killing the stream.
        history[-1]["content"] = f"Error: {str(e)}"
        yield history


def user_submit(user_msg, history):
    """Append the user's message to the history and clear the input box.

    Args:
        user_msg: Raw text from the message textbox.
        history: Current chat history (may be ``None`` on first use).

    Returns:
        Tuple of ("" to clear the textbox, updated history). Blank/whitespace
        submissions are ignored.
    """
    if not user_msg.strip():
        return "", history
    history = history or []
    history.append({"role": "user", "content": user_msg})
    return "", history


def update_status():
    """Load the model (triggered on page load) and report a status string."""
    model = load_model()
    return "✅ Model loaded successfully!" if model else "⚠️ Model failed to load."


with gr.Blocks(title="🧠 DeepSeek Chat (Streaming)", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 DeepSeek Chat (Streaming)")
    # type="messages" matches the {"role", "content"} dicts used throughout;
    # the default tuple format would mis-render (or reject) this history.
    chatbot = gr.Chatbot(
        [],
        label="Chat History",
        height=500,
        render_markdown=True,
        type="messages",
    )
    with gr.Row():
        msg = gr.Textbox(placeholder="Type your message here...", label="Your Message")
        submit_btn = gr.Button("Send")
        clear_btn = gr.Button("Clear Chat")
    status_box = gr.Markdown("Model status: Not loaded yet.")

    # Enter key and Send button share the same two-step pipeline:
    # synchronously record the user turn, then stream the assistant reply.
    msg.submit(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
        stream_chat, chatbot, chatbot
    )
    submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
        stream_chat, chatbot, chatbot
    )
    clear_btn.click(lambda: ([], None), None, [chatbot, msg], queue=False)

    # Kick off model loading as soon as the page opens.
    demo.load(update_status, None, status_box)

# Exported for hosting platforms that import the module and look for `app`.
app = demo

if __name__ == "__main__":
    demo.launch()