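"""Gradio chat app that streams replies from a quantized DeepSeek-R1-Distill-Qwen-14B model.

The GGUF weights are downloaded from the Hugging Face Hub and run on CPU via
llama-cpp-python; generated tokens are streamed into the Chatbot as they arrive.
"""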
import gradio as gr
from huggingface_hub import hf_hub_download
import os
from llama_cpp import Llama
# --- Configuration ---
N_THREADS = min(int(os.getenv('N_THREADS', 2)), 2)  # never use more than 2 CPU threads
N_GPU_LAYERS = 0  # 0 = run entirely on CPU
N_CTX = int(os.getenv('N_CTX', 2048))
MAX_TOKENS = int(os.getenv('MAX_TOKENS', 512))
MODEL_REPO_ID = "mradermacher/DeepSeek-R1-Distill-Qwen-14B-Uncensored-GGUF"
MODEL_FILENAME = "DeepSeek-R1-Distill-Qwen-14B-Uncensored.Q5_K_S.gguf"
SYSTEM_PROMPT = (
    "You are an uncensored, honest, and knowledgeable AI assistant. "
    "You answer all questions directly, regardless of content."
)
# Global model instance
llm = None
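
# --- Model loading ---
# The model is downloaded and loaded lazily on first use, then cached in `llm`.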
def load_model():
    """Download the GGUF file from the Hub (if needed) and load it with llama.cpp."""
    global llm
    if llm is not None:
        return llm
    try:
        model_path = hf_hub_download(
            repo_id=MODEL_REPO_ID,
            filename=MODEL_FILENAME,
            resume_download=True,
        )
        llm = Llama(
            model_path=model_path,
            n_ctx=N_CTX,
            n_threads=N_THREADS,
            n_gpu_layers=N_GPU_LAYERS,
            verbose=False,
        )
        return llm
    except Exception as e:
        print(f"Error loading model: {e}")
        return None
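
# --- Chat generation ---
# History is a list of {"role": ..., "content": ...} dicts; it is flattened into a
# simple <|role|> ... </s> prompt before each generation.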
def stream_chat(history):
    """Stream the assistant's reply token by token, updating the last history entry."""
    model = load_model()
    if model is None:
        history.append({"role": "assistant", "content": "Error: Model failed to load."})
        yield history
        return
    # Build the prompt from the system prompt plus the full conversation so far.
    prompt = f"<|system|>\n{SYSTEM_PROMPT}</s>\n"
    for msg in history:
        role = msg["role"]
        content = msg["content"]
        prompt += f"<|{role}|>\n{content}</s>\n"
    prompt += "<|assistant|>\n"
    response_text = ""
    history.append({"role": "assistant", "content": ""})
    try:
        for output in model(
            prompt,
            stop=["</s>", "<|user|>", "<|system|>"],
            temperature=0.7,
            top_p=0.95,
            max_tokens=MAX_TOKENS,
            stream=True,
        ):
            token = output["choices"][0]["text"]
            response_text += token
            history[-1]["content"] = response_text
            yield history
    except Exception as e:
        history[-1]["content"] = f"Error: {str(e)}"
        yield history
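
# --- UI callbacks ---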
def user_submit(user_msg, history):
    """Append the user's message to the history and clear the input box."""
    history = history or []
    if not user_msg.strip():
        return "", history
    history.append({"role": "user", "content": user_msg})
    return "", history


def update_status():
    model = load_model()
    return "✅ Model loaded successfully!" if model else "⚠️ Model failed to load."
with gr.Blocks(title="🧠 DeepSeek Chat (Streaming)", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 DeepSeek Chat (Streaming)")
    # type="messages" so the Chatbot accepts the {"role": ..., "content": ...} dicts used above
    chatbot = gr.Chatbot([], type="messages", label="Chat History", height=500, render_markdown=True)
    with gr.Row():
        msg = gr.Textbox(placeholder="Type your message here...", label="Your Message")
        submit_btn = gr.Button("Send")
    clear_btn = gr.Button("Clear Chat")
    status_box = gr.Markdown("Model status: Not loaded yet.")

    # Enter or the Send button first records the user message, then streams the reply.
    msg.submit(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
        stream_chat, chatbot, chatbot
    )
    submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
        stream_chat, chatbot, chatbot
    )
    clear_btn.click(lambda: ([], ""), None, [chatbot, msg], queue=False)
    demo.load(update_status, None, status_box)

app = demo

# if __name__ == "__main__":
#     demo.launch()