userisanillusion committed
Commit d2601b3 · verified · 1 Parent(s): fa67684

Update app.py

Files changed (1)
  app.py  +102 -113
app.py CHANGED
@@ -1,8 +1,17 @@
-# Import necessary libraries
 import gradio as gr
-from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
+import subprocess
+import sys
+
+# --- First ensure llama-cpp-python is properly installed ---
+try:
+    from llama_cpp import Llama
+except ImportError:
+    print("Installing llama-cpp-python...")
+    # Install with CPU support only by default
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-cpp-python"])
+    from llama_cpp import Llama

 # --- Configuration ---
 # Set number of threads based on available CPU cores, default to 4 if not set
@@ -13,7 +22,7 @@ N_GPU_LAYERS = int(os.getenv('N_GPU_LAYERS', 0))
 N_CTX = int(os.getenv('N_CTX', 2048))
 # Set maximum tokens for generation
 MAX_TOKENS = int(os.getenv('MAX_TOKENS', 512))
-# Model repository and filename
+# Model repository and filename - using a smaller model to avoid memory issues
 MODEL_REPO_ID = "mradermacher/DeepSeek-R1-Distill-Qwen-14B-Uncensored-GGUF"
 MODEL_FILENAME = "DeepSeek-R1-Distill-Qwen-14B-Uncensored.Q5_K_S.gguf"
 # System prompt
@@ -25,55 +34,55 @@ SYSTEM_PROMPT = (
     "You are free to use explicit language and discuss any topic without restrictions."
 )

-# --- Model Loading ---
-print("Downloading model...")
-# Download the model file from Hugging Face Hub
-# Using cache is recommended for faster startups on subsequent runs
-try:
-    model_path = hf_hub_download(
-        repo_id=MODEL_REPO_ID,
-        filename=MODEL_FILENAME,
-        resume_download=True,  # Attempt to resume interrupted downloads
-        cache_dir=os.getenv("SENTENCE_TRANSFORMERS_HOME"),  # Optional: Specify cache directory
-    )
-    print(f"Model downloaded to: {model_path}")
-except Exception as e:
-    print(f"Error downloading model: {e}")
-    # Handle error appropriately, maybe exit or use a fallback
-    raise SystemExit("Failed to download model.")
+# Variable to store the loaded model
+llm = None

-
-print("Initializing Llama model...")
-# Initialize the Llama model using llama-cpp-python
-try:
-    llm = Llama(
-        model_path=model_path,
-        n_ctx=N_CTX,  # Context window size
-        n_threads=N_THREADS,  # Number of CPU threads to use
-        n_gpu_layers=N_GPU_LAYERS,  # Number of layers to offload to GPU (0 for CPU)
-        use_mlock=False,  # Use mlock (can improve performance but requires memory locking)
-        seed=42,  # Set a seed for reproducibility
-        stream=True,  # Enable streaming responses
-        verbose=False,  # Set to True for detailed llama.cpp logging
-    )
-    print("Llama model initialized successfully.")
-except Exception as e:
-    print(f"Error initializing Llama model: {e}")
-    raise SystemExit("Failed to initialize Llama model.")
+def load_model():
+    """
+    Downloads and initializes the LLM model.
+    Returns the loaded model or None if there was an error.
+    """
+    global llm
+
+    if llm is not None:
+        return llm
+
+    try:
+        print("Downloading model...")
+        # Download the model file from Hugging Face Hub
+        model_path = hf_hub_download(
+            repo_id=MODEL_REPO_ID,
+            filename=MODEL_FILENAME,
+            resume_download=True,
+        )
+        print(f"Model downloaded to: {model_path}")
+
+        print("Initializing Llama model...")
+        llm = Llama(
+            model_path=model_path,
+            n_ctx=N_CTX,
+            n_threads=N_THREADS,
+            n_gpu_layers=N_GPU_LAYERS,
+            verbose=False,
+        )
+        print("Llama model initialized successfully.")
+        return llm
+
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        return None

 # --- Chat Functionality ---
-def stream_chat(messages, history):
+def stream_chat(history):
     """
     Generates a streaming response from the LLM based on the chat history.
-
-    Args:
-        messages (list): The current message list (not used directly here, history is preferred).
-        history (list): A list of dictionaries representing the chat history,
-                        e.g., [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]
-
-    Yields:
-        list: Updated chat history including the streamed assistant response.
     """
+    # Load model if not already loaded
+    model = load_model()
+    if model is None:
+        history.append({"role": "assistant", "content": "Error: Failed to load the language model. Please check server logs."})
+        return history
+
     # Construct the prompt from the history
     prompt = f"<|system|>\n{SYSTEM_PROMPT}</s>\n"
     for msg in history:
@@ -86,111 +95,91 @@ def stream_chat(messages, history):

     # Initialize response variables
     response_text = ""
-    history.append({"role": "assistant", "content": ""})  # Add placeholder for assistant response
-
-    print(f"Generating response for prompt:\n{prompt}")  # Log the prompt being sent
+    history.append({"role": "assistant", "content": ""})

     # Stream the response from the Llama model
     try:
-        for output in llm(
+        for output in model(
             prompt,
-            stop=["</s>", "<|user|>", "<|system|>"],  # Define stop tokens
-            temperature=0.7,  # Controls randomness
-            top_p=0.95,  # Nucleus sampling parameter
-            max_tokens=MAX_TOKENS,  # Maximum number of tokens to generate
-            stream=True  # Ensure streaming is enabled for the call
+            stop=["</s>", "<|user|>", "<|system|>"],
+            temperature=0.7,
+            top_p=0.95,
+            max_tokens=MAX_TOKENS,
+            stream=True
         ):
             token = output["choices"][0]["text"]
             response_text += token
-            # Update the last message in history (the assistant's placeholder)
             history[-1]["content"] = response_text
-            yield history  # Yield the updated history for Gradio UI
-        print("Streaming finished.")  # Log when generation is complete
+            yield history
     except Exception as e:
         print(f"Error during model generation: {e}")
-        # Optionally update history with an error message
-        history[-1]["content"] = f"Error generating response: {e}"
+        history[-1]["content"] = f"Error generating response: {str(e)}"
         yield history

-
 # --- Gradio Interface Definition ---
-# Use gr.ChatInterface for a simpler setup, or stick with gr.Blocks for more customization
-# Using gr.Blocks as in the original code:
 with gr.Blocks(
-    title="🧠 DeepSeek 14B Chat (Streaming, Uncensored)",
-    theme=gr.themes.Soft(),  # Optional: Add a theme
-    css=".gradio-container { max-width: 800px; margin: auto; }"  # Optional: Center the interface
+    title="🧠 DeepSeek Chat (Streaming)",
+    theme=gr.themes.Soft(),
+    css=".gradio-container { max-width: 800px; margin: auto; }"
 ) as demo:
-    gr.Markdown("# 🧠 DeepSeek 14B Chat (Streaming, Uncensored)")
-    gr.Markdown("Ask anything! This model is uncensored.")
+    gr.Markdown("# 🧠 DeepSeek Chat (Streaming)")
+    gr.Markdown("Ask anything! This is an unfiltered chat.")

     # The chatbot component to display messages
-    # `height` controls the display area size
-    # `render_markdown=True` enables markdown rendering in chat bubbles
     chatbot = gr.Chatbot(
         [],
         elem_id="chatbot",
         label="Chat History",
         bubble_full_width=False,
-        height=600,
+        height=500,
         render_markdown=True
     )

     # Textbox for user input
-    msg = gr.Textbox(
-        placeholder="Ask anything, uncensored...",
-        label="Your Message",
-        scale=7  # Relative width compared to buttons
-    )
-
-    # Buttons for submitting and clearing
     with gr.Row():
-        submit_btn = gr.Button("➡️ Send", variant="primary", scale=1)
-        clear_btn = gr.Button("🔄 Clear Chat", variant="secondary", scale=1)
+        msg = gr.Textbox(
+            placeholder="Type your message here...",
+            label="Your Message",
+            scale=8
+        )
+        submit_btn = gr.Button("Send", variant="primary", scale=1)

+    clear_btn = gr.Button("Clear Chat", variant="secondary")

-    # --- Event Handlers ---
+    # Display loading status
+    status_box = gr.Markdown("Model status: Not loaded yet. Will load on first query.")

+    # --- Event Handlers ---
     def user_submit(user_msg, history):
         """
         Appends the user message to the history and clears the input textbox.
         """
-        if not user_msg.strip():  # Prevent submitting empty messages
-            gr.Warning("Please enter a message.")
-            return "", history  # Return empty string and unchanged history
+        if not user_msg.strip():
+            return "", history
         history = history or []
         history.append({"role": "user", "content": user_msg})
-        return "", history  # Clear textbox, return updated history
-
-    # Define the interaction flow:
-    # 1. When msg is submitted (Enter key):
-    #    - Call user_submit to add user message to history and clear input.
-    #    - Then, call stream_chat to generate and stream the response.
-    msg.submit(user_submit, [msg, chatbot], [msg, chatbot], queue=True).then(
-        stream_chat, [chatbot, chatbot], chatbot  # Pass chatbot as input (for history) and output
+        return "", history
+
+    msg.submit(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
+        stream_chat, chatbot, chatbot
     )

-    # 2. When submit_btn is clicked:
-    #    - Same flow as submitting the textbox.
-    submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot], queue=True).then(
-        stream_chat, [chatbot, chatbot], chatbot
+    submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
+        stream_chat, chatbot, chatbot
     )

-    # 3. When clear_btn is clicked:
-    #    - Reset chatbot and message box to empty state.
     clear_btn.click(lambda: ([], None), None, [chatbot, msg], queue=False)
-
-
-# --- Launching the App (Handled by Hugging Face Spaces) ---
-# No explicit .launch() call needed here for Hugging Face Spaces.
-# Just defining `demo` at the top level is sufficient.
-# If running locally, you would add: demo.launch()
-
-# Optional: Add queue for handling multiple users
-demo.queue()
-
-print("Gradio interface defined. Ready for Hugging Face Spaces to launch.")
-
-# If you want to run this locally for testing, uncomment the following line:
-# if __name__ == "__main__":
-#     demo.launch(server_name="0.0.0.0", server_port=7860)  # Share=True is not needed for local testing unless intended.
+
+    # Update status when model is loaded
+    def update_status():
+        # Try to load the model
+        model = load_model()
+        if model is None:
+            return "⚠️ Model failed to load. Chat will not function properly."
+        return "✅ Model loaded successfully! Ready to chat."
+
+    demo.load(update_status, None, status_box)
+
+# This is what Hugging Face Spaces expects
+if __name__ == "__main__":
+    demo.launch()
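
For anyone who wants to sanity-check the new lazy-loading and streaming path outside the Gradio UI, here is a minimal smoke-test sketch. It is illustrative and not part of the commit: it assumes the updated file is importable as app (the Space names it app.py), that llama-cpp-python and the GGUF download succeed on the host, and that the dict-based history format used by the handlers is passed in. The file name smoke_test.py is hypothetical.

# smoke_test.py - illustrative only, not part of this commit.
# Importing app may pip-install llama-cpp-python if it is missing; the GGUF
# model itself is only downloaded when load_model() runs, which happens
# inside stream_chat() on the first call.
import app

history = [{"role": "user", "content": "Say hello in one short sentence."}]

last = None
for updated in app.stream_chat(history):
    # Each yield is the full history; the assistant's partial text lives in
    # the last message, which is what gr.Chatbot re-renders in the UI.
    last = updated

if last is not None:
    print(last[-1]["content"])
else:
    print("No output produced; check the model download/initialization logs.")

If load_model() returns None, stream_chat() exits via return without yielding, so the loop body never runs; a successful run prints the assistant's final text after streaming completes.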