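# app.py — Gradio chat demo that runs a quantized Mistral-7B-OpenOrca GGUF model
# locally with llama.cpp through LangChain's LlamaCpp wrapper.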
import gradio as gr
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from huggingface_hub import hf_hub_download

# Stream generated tokens to stdout so progress is visible in the logs.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Download the quantized GGUF weights from the Hugging Face Hub into the working directory.
repo_id = "TheBloke/Mistral-7B-OpenOrca-GGUF"
model_name = "mistral-7b-openorca.Q5_K_M.gguf"
hf_hub_download(repo_id=repo_id, filename=model_name, local_dir=".")
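
# Build a Mistral-style chat prompt: each previous turn is wrapped as
# "[INST] user [/INST] response</s>", then the new user message is appended.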
def format_prompt(message, history):
    prompt = "<s>"
    for user_prompt, bot_response in history:
        prompt += f"[INST] {user_prompt} [/INST]"
        prompt += f" {bot_response}</s> "
    prompt += f"[INST] {message} [/INST]"
    return prompt
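
# Run one chat turn through the local model. The LlamaCpp model is
# re-instantiated on every call so the per-request sampling settings apply;
# this reloads the weights each time, which is slow but keeps the code simple.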
def generate(
    prompt, history, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.2,
):
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)
    formatted_prompt = format_prompt(prompt, history)
    llm = LlamaCpp(
        model_path=model_name,
        temperature=temperature,
        max_tokens=int(max_new_tokens),
        top_p=top_p,
        repeat_penalty=float(repetition_penalty),
        n_ctx=4096,  # the default context window (512) is too small for longer chats
        callback_manager=callback_manager,
        verbose=True,  # Verbose is required to pass to the callback manager
    )
    output = llm(formatted_prompt)
    return output
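
# Extra controls shown in the Gradio UI. ChatInterface passes their values to
# generate() positionally after (message, history), so this order must match
# the function signature above.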
additional_inputs = [
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=256,
        minimum=0,
        maximum=1024,
        step=64,
        interactive=True,
        info="The maximum number of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    ),
]
css = """
#mkd {
height: 500px;
overflow: auto;
border: 1px solid #ccc;
}
"""
with gr.Blocks(css=css) as demo:
    gr.HTML("<h1><center>Mistral 7B OpenOrca</center></h1>")
    gr.HTML("<h3><center>In this demo, you can chat with the <a href='https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-GGUF'>Mistral-7B-OpenOrca (GGUF)</a> model. 💬</center></h3>")
    gr.HTML("<h3><center>Learn more about the model <a href='https://huggingface.co/docs/transformers/main/model_doc/mistral'>here</a>. 📚</center></h3>")
    gr.ChatInterface(
        generate,
        additional_inputs=additional_inputs,
        examples=[["What is the secret to life?"], ["Write me a recipe for pancakes."]],
    )

demo.queue().launch(debug=True)