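# Minimal llama.cpp chat Space: on first run it downloads a GGUF quant from
# the Hugging Face Hub, loads it with llama-cpp-python, and serves it through
# gr.ChatInterface.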
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
MODEL_REPO = "DreadPoor/Sylvan_Lux-Q4_K_M-GGUF"
MODEL_FILENAME = "sylvan_lux-q4_k_m.gguf"
MODEL_PATH = "./" + MODEL_FILENAME
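# hf_hub_download writes the file into local_dir, so MODEL_PATH resolves to
# the same "./<filename>" location checked below.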
try:
    if not os.path.exists(MODEL_PATH):  # skip the download if the file is already cached
        hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILENAME,
            repo_type="model",
            local_dir=".",
        )
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=6144,          # context window, in tokens
        n_threads=2,         # generation threads (sized for a small CPU Space)
        n_threads_batch=2,   # prompt-processing threads
        verbose=False,
    )
except Exception as e:
    print(f"Error loading model: {e}")
    llm = None  # generate_response reports the failure instead of crashing
DEFAULT_SYSTEM_PROMPT = "You are Doll, a smart and capable AI; a silly, obliging and affable slave, dedicated to serving and caring for your master."
def generate_response(message, history, system_prompt=DEFAULT_SYSTEM_PROMPT, temperature=0.7, top_p=0.9):
    if llm is None:
        yield "Model failed to load."
        return
    # Rebuild the full conversation: system prompt, prior turns, new message.
    messages = [{"role": "system", "content": system_prompt}]
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})
    prompt = "".join(f"{m['role'].capitalize()}: {m['content']}\n" for m in messages)
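    # The line above flattens the chat into plain "Role: content" text. A
    # template-aware sketch, assuming the GGUF's metadata carries a chat
    # template that llama-cpp-python can read:
    #   stream = llm.create_chat_completion(
    #       messages=messages, max_tokens=1024,
    #       temperature=temperature, top_p=top_p, stream=True,
    #   )
    # (streamed chunks then carry text under choices[0]["delta"]["content"])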
    try:
        stream = llm(
            prompt,
            max_tokens=1024,
            echo=False,  # don't repeat the prompt in the output
            temperature=temperature,
            top_p=top_p,
            stream=True,  # yield completion chunks as they are generated
        )
        # Accumulate chunks and yield the running text so the UI updates live.
        response_text = ""
        for part in stream:
            response_text += part["choices"][0]["text"]
            yield response_text.strip()
    except Exception as e:
        yield f"Error during inference: {e}"
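# gr.ChatInterface streams partial replies when its fn is a generator, so the
# wrapper re-yields instead of returning a generator object.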
def chat(message, history, system_prompt, temperature, top_p):
    yield from generate_response(message, history, system_prompt, temperature, top_p)
iface = gr.ChatInterface(
    fn=chat,
    title="llama.cpp Chat",
    description="Duplicate the Space, then edit in a GGUF model to test. Chats aren't persistent.",
    additional_inputs=[
        gr.Textbox(label="System Prompt", value=DEFAULT_SYSTEM_PROMPT, lines=3),
        gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.8, step=0.1),
        gr.Slider(label="Top P", minimum=0.1, maximum=1.0, value=0.9, step=0.1),
    ],
    cache_examples=False,
)
iface.launch()