Dread2Poor committed
Commit ee85cfc · verified · 1 Parent(s): fa6cf2a

Update app.py

Files changed (1)
  app.py +60 -76
app.py CHANGED
@@ -1,77 +1,61 @@
-from llama_cpp import Llama
-from huggingface_hub import hf_hub_download
 import gradio as gr
-
-hf_hub_download(repo_id="Dread2Poor/Killamanjaro-8B-Model_Stock-Q4_K_M-GGUF", filename="killamanjaro-8b-model_stock-q4_k_m.gguf", repo_type="model", local_dir=".")
-
-# Initialize the model
-llm = Llama(model_path="./killamanjaro-8b-model_stock-q4_k_m.gguf", n_threads=2)
-
-# System prompt
-system_prompt = "You are my mother. You are caring, obliging and affable."
-
-# Chat history
-history = [{"role": "system", "content": system_prompt}]
-
-def format_prompt(history):
-    """
-    Format the chat history into a prompt for the model.
-    """
-    prompt = ""
-    for turn in history:
-        if turn["role"] == "system":
-            prompt += f"<|system|>\n{turn['content']}\n<|end|>\n"
-        elif turn["role"] == "user":
-            prompt += f"<|user|>\n{turn['content']}\n<|end|>\n"
-        elif turn["role"] == "assistant":
-            prompt += f"<|assistant|>\n{turn['content']}\n<|end|>\n"
-    prompt += "<|assistant|>\n"  # Model expects the assistant's reply next
-    return prompt
-
-def chat_with_model(user_input):
-    """
-    Engage in a chat with the model using the provided user input.
-    """
-    global history
-
-    # Append user input to history
-    history.append({"role": "user", "content": user_input})
-
-    # Format the prompt with history
-    prompt = format_prompt(history)
-
-    # Generate response
-    response = llm(prompt, max_tokens=150, stop=["<|end|>"])
-
-    # Extract assistant's reply
-    assistant_reply = response['choices'][0]['text'].strip()
-
-    # Append assistant's reply to history
-    history.append({"role": "assistant", "content": assistant_reply})
-
-    return assistant_reply
-
-def reset_history():
-    """
-    Reset the chat history to the initial state.
-    """
-    global history
-    history = [{"role": "system", "content": system_prompt}]
-    return gr.update(value=""), gr.update(value=[])
-
-# Create Gradio interface
-with gr.Blocks() as demo:
-    chatbot = gr.Chatbot()
-    with gr.Row():
-        with gr.Column(scale=4):
-            user_input = gr.Textbox(show_label=False, placeholder="Type your message here...")
-        with gr.Column(scale=1):
-            submit_btn = gr.Button("Send")
-    with gr.Row():
-        clear_btn = gr.Button("Clear Chat")
-
-    submit_btn.click(chat_with_model, inputs=user_input, outputs=chatbot)
-    clear_btn.click(reset_history, inputs=[], outputs=[user_input, chatbot])
-
-# Launch the interface
-demo.launch()
 
 
 
 import gradio as gr
+from llama_cpp import Llama
+import os
+import requests
+import tempfile
+
+MODEL_PATH = "model.gguf"  # Default model name
+MODEL_URL = None  # Optional URL to download the model from
+
+def download_model(url, save_path):
+    """Downloads a file from a URL."""
+    response = requests.get(url, stream=True)
+    response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
+
+    with open(save_path, "wb") as file:
+        for chunk in response.iter_content(chunk_size=8192):
+            file.write(chunk)
+
+def load_model(model_path):
+    """Loads a GGUF model using llama-cpp-python."""
+    try:
+        llm = Llama(model_path)
+        return llm
+    except Exception as e:
+        return f"Error loading model: {e}"
+
+def generate_response(prompt, model):
+    """Generates a response using the loaded model."""
+    if isinstance(model, str):  # if there was an error loading the model
+        return model
+    try:
+        output = model(prompt, max_tokens=256)  # Adjust max_tokens as needed
+        return output["choices"][0]["text"]
+    except Exception as e:
+        return f"Error generating response: {e}"
+
+# Download model if URL is provided and model doesn't exist
+if MODEL_URL and not os.path.exists(MODEL_PATH):
+    print(f"Downloading model from {MODEL_URL}...")
+    try:
+        download_model(MODEL_URL, MODEL_PATH)
+        print("Model downloaded successfully.")
+    except Exception as e:
+        print(f"Error downloading model: {e}")
+
+# Load the model
+llm = load_model(MODEL_PATH)
+
+def inference(prompt):
+    return generate_response(prompt, llm)
+
+# Gradio Interface
+iface = gr.Interface(
+    fn=inference,
+    inputs=gr.Textbox(lines=5, placeholder="Enter your prompt here..."),
+    outputs=gr.Textbox(lines=10),
+    title="llama.cpp CPU Inference",
+    description="Generate text using a GGUF model with llama.cpp on CPU.",
+)
+
+iface.launch()
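
Note: with MODEL_URL left as None, the new script expects a GGUF file to already exist at MODEL_PATH ("model.gguf"). One way to supply it for local testing is to pull the same GGUF the previous version of app.py downloaded and move it to that default path; this is an illustrative sketch only, not part of the commit:

# Illustrative only: fetch the GGUF the previous app.py used via huggingface_hub,
# then place it where the updated script looks for it (MODEL_PATH = "model.gguf").
from huggingface_hub import hf_hub_download
import os

hf_hub_download(
    repo_id="Dread2Poor/Killamanjaro-8B-Model_Stock-Q4_K_M-GGUF",
    filename="killamanjaro-8b-model_stock-q4_k_m.gguf",
    repo_type="model",
    local_dir=".",
)
os.replace("killamanjaro-8b-model_stock-q4_k_m.gguf", "model.gguf")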