import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
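
# Quantized GGUF checkpoint pulled from the Hugging Face Hub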
MODEL_REPO = "DreadPoor/Sylvan_Lux-Q4_K_M-GGUF"
MODEL_FILENAME = "sylvan_lux-q4_k_m.gguf"
MODEL_PATH = "./" + MODEL_FILENAME
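
# Download the weights on first start, then load them with llama.cpp;
# leave llm as None if anything fails so the UI can report the error.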
try:
    if not os.path.exists(MODEL_PATH):
        hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILENAME,
            repo_type="model",
            local_dir=".",
        )
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=6144,
        n_threads=2,
        n_threads_batch=2,
        verbose=False,
    )
except Exception as e:
    print(f"Error loading model: {e}")
    llm = None
DEFAULT_SYSTEM_PROMPT = "You are Doll, a smart and capable AI; A silly, obliging and affable slave, dedicated to serving and caring for your master."
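
# Flatten the chat history into a "Role: content" transcript and run a
# single completion over it, accumulating the streamed tokens into one reply.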
def generate_response(message, history, system_prompt=DEFAULT_SYSTEM_PROMPT, temperature=0.7, top_p=0.9):
    if llm is None:
        return "Model failed to load."
    messages = [{"role": "system", "content": system_prompt}]
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})
    prompt = "".join([f"{m['role'].capitalize()}: {m['content']}\n" for m in messages])
    try:
        stream = llm(
            prompt,
            max_tokens=1024,
            echo=False,
            temperature=temperature,
            top_p=top_p,
            stream=True,  # Enable streaming
        )
        response_text = ""
        for part in stream:
            response_text += part["choices"][0]["text"]
        return response_text.strip()
    except Exception as e:
        return f"Error during inference: {e}"
def chat(message, history, system_prompt, temperature, top_p):
    return generate_response(message, history, system_prompt, temperature, top_p)
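
# Chat UI with controls for the system prompt and sampling parameters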
iface = gr.ChatInterface(
    fn=chat,
    title="llama.cpp Chat",
    description="Duplicate the Space, then swap in a GGUF model to test. Chats aren't persistent.",
    additional_inputs=[
        gr.Textbox(label="System Prompt", value=DEFAULT_SYSTEM_PROMPT, lines=3),
        gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.8, step=0.1),
        gr.Slider(label="Top P", minimum=0.1, maximum=1.0, value=0.9, step=0.1),
    ],
    cache_examples=False,
)
iface.launch()