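"""Gradio Space app for chatting with a GGUF model through llama-cpp-python.

Paste the URL of a Hugging Face repository that contains GGUF files, choose a
quantization type, click "Load Model", and then chat with the loaded model.
(Module docstring added for orientation; it describes the script below.)
"""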
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download, HfApi
import os
import sys
import time
import requests
from tqdm import tqdm # For progress bars
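# Assumed runtime dependencies (normally pinned in the Space's requirements.txt):
# gradio, llama-cpp-python, huggingface_hub, requests, tqdm.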
MODEL_PATH = "./"
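# NOTE: MODEL_REPO and MODEL_FILENAME are referenced by load_model() as the fallback
# when no repository URL is given, but the original script never defines them.
# The values below are hypothetical placeholders; point them at the GGUF repo and
# file you actually want to load by default.
MODEL_REPO = "your-username/your-model-GGUF"  # hypothetical placeholder
MODEL_FILENAME = "model.Q4_K_M.gguf"  # hypothetical placeholder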
llm = None
api = HfApi()
DEFAULT_SYSTEM_PROMPT = "You are Doll, a smart and capable AI; A silly, obliging and affable slave, dedicated to serving and caring for your master."
def download_file(url, local_filename):
"""Downloads a file from a URL with a progress bar."""
try:
with requests.get(url, stream=True) as r:
r.raise_for_status()
total_length = int(r.headers.get("content-length"))
with open(local_filename, "wb") as f:
with tqdm(total=total_length, unit="B", unit_scale=True, desc=local_filename) as pbar:
for chunk in r.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
pbar.update(len(chunk))
return True
except Exception as e:
print(f"Error downloading {url}: {e}")
return False
def find_quantized_model_url(repo_url, quant_type="Q4_K_M"):
"""
Finds the URL of a specific quantized GGUF model file within a Hugging Face repository.
"""
try:
repo_id = repo_url.replace("https://huggingface.co/", "")
files = api.list_repo_files(repo_id=repo_id, repo_type="model")
for file_info in files:
if file_info.name.endswith(".gguf") and quant_type.lower() in file_info.name.lower():
model_url = f"https://huggingface.co/{repo_id}/resolve/main/{file_info.name}"
print(f"Found quantized model URL: {model_url}")
return model_url
print(f"Quantized model with type {quant_type} not found in repository {repo_url}")
return None
except Exception as e:
print(f"Error finding quantized model: {e}")
return None
def load_model(repo_url=None, quant_type="Q4_K_M"):
"""Loads the Llama model, downloading the specified quantized version from a repository."""
global llm
global MODEL_PATH
try:
if repo_url:
model_url = find_quantized_model_url(repo_url, quant_type)
if model_url is None:
return f"Quantized model ({quant_type}) not found in the repository."
print(f"Downloading model from {model_url}...")
downloaded_model_name = os.path.basename(model_url)
download_success = download_file(model_url, downloaded_model_name)
if not download_success:
return "Model download failed."
model_path = downloaded_model_name
else:
model_path = MODEL_PATH + MODEL_FILENAME
if not os.path.exists(model_path):
if not repo_url: # only try to download if a repo_url was not provided
hf_hub_download(
repo_id=MODEL_REPO,
filename=MODEL_FILENAME,
repo_type="model",
local_dir=".",
)
if not os.path.exists(model_path): # check again after attempting download
return f"Model file not found at {model_path}."
print(f"Loading model from {model_path}...")
llm = Llama(
model_path=model_path,
n_ctx=4096,
n_threads=2,
n_threads_batch=2,
verbose=False,
)
print("Model loaded successfully.")
return "Model loaded successfully."
except Exception as e:
error_message = f"Error loading model: {e}"
print(error_message)
llm = None
return error_message
def generate_response(message, history, system_prompt=DEFAULT_SYSTEM_PROMPT, temperature=0.7, top_p=0.9):
"""Generates a response from the Llama model."""
if llm is None:
yield "Model failed to load. Please check the console for error messages."
return
messages = [{"role": "system", "content": system_prompt}]
for human, assistant in history:
messages.append({"role": "user", "content": human})
messages.append({"role": "assistant", "content": assistant})
messages.append({"role": "user", "content": message})
prompt = "".join([f"{m['role'].capitalize()}: {m['content']}\n" for m in messages])
try:
for chunk in llm.create_completion(
prompt,
max_tokens=1024,
echo=False,
temperature=temperature,
top_p=top_p,
stream=True,
):
text = chunk["choices"][0]["text"]
yield text
except Exception as e:
error_message = f"Error during inference: {e}"
print(error_message)
yield error_message
def chat(message, history, system_prompt, temperature, top_p):
"""Wrapper function for the chat interface."""
return generate_response(message, history, system_prompt, temperature, top_p)
def main():
"""Main function to load the model and launch the Gradio interface."""
def load_model_and_launch(repo_url, quant_type):
model_load_message = load_model(repo_url, quant_type)
return model_load_message
with gr.Blocks() as iface:
gr.Markdown("## llama.cpp Chat")
status_label = gr.Label(label="Model Loading Status")
repo_url_input = gr.Textbox(label="Repository URL", placeholder="Enter repository URL")
quant_type_input = gr.Dropdown(
label="Quantization Type",
choices=["Q4_K_M", "Q6", "Q4_K_S"],
value="Q4_K_M",
)
load_button = gr.Button("Load Model")
chat_interface = gr.ChatInterface(
fn=chat,
description="Test a GGUF model. Chats aren't persistent.",
additional_inputs=[
gr.Textbox(label="System Prompt", value=DEFAULT_SYSTEM_PROMPT, lines=3),
gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.8, step=0.1),
gr.Slider(label="Top P", minimum=0.1, maximum=1.0, value=0.9, step=0.1),
],
cache_examples=False,
)
load_button.click(
load_model_and_launch,
inputs=[repo_url_input, quant_type_input],
outputs=status_label,
)
iface.launch()
if __name__ == "__main__":
main()
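# Example usage (assumed, not part of the original Space): run `python app.py`, open the
# local Gradio URL, paste a GGUF repository URL such as
# https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF, keep the default Q4_K_M
# quantization, click "Load Model", and start chatting once the status label reports success.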