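"""Gradio chat demo that streams responses from a GGUF-quantized Qwen2-7B-Instruct
model served locally via llama-cpp-python."""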
import gradio as gr
from llama_cpp import Llama

model = "Qwen/Qwen2-7B-Instruct-GGUF"

# Download the quantized GGUF weights from the Hugging Face Hub and load them
# with llama.cpp: locked into RAM (no mmap), two CPU threads, and an
# 8,000-token context window.
llm = Llama.from_pretrained(
    repo_id=model,
    filename="qwen2-7b-instruct-q4_k_m.gguf",
    verbose=True,
    use_mmap=False,
    use_mlock=True,
    n_threads=2,
    n_threads_batch=2,
    n_ctx=8000,
)

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Rebuild the full conversation: system prompt, prior turns, then the new message.
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    # Non-streaming alternative:
    # response = llm.create_chat_completion(
    #     messages=messages,
    #     max_tokens=max_tokens,
    #     temperature=temperature,
    #     top_p=top_p,
    # )
    # return response["choices"][0]["message"]["content"]
    # Stream the completion and yield the partial reply so the Gradio UI
    # updates token by token.
    response = ""
    for chunk in llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        # Stream chunks are dicts; the first delta only carries the role,
        # so "content" may be missing.
        token = chunk["choices"][0]["delta"].get("content")
        if token:
            response += token
            yield response

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value="You are a helpful assistant.",
            label="System message",
        ),
        gr.Slider(minimum=1, maximum=2048, value=1024, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    description=model,
)

if __name__ == "__main__":
    demo.launch()