userisanillusion committed
Commit d2601b3 · verified · 1 Parent(s): fa67684

Update app.py

Files changed (1)
  app.py  +102 -113
app.py CHANGED
@@ -1,8 +1,17 @@
-# Import necessary libraries
 import gradio as gr
-from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
+import subprocess
+import sys
+
+# --- First ensure llama-cpp-python is properly installed ---
+try:
+    from llama_cpp import Llama
+except ImportError:
+    print("Installing llama-cpp-python...")
+    # Install with CPU support only by default
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-cpp-python"])
+    from llama_cpp import Llama

 # --- Configuration ---
 # Set number of threads based on available CPU cores, default to 4 if not set
@@ -13,7 +22,7 @@ N_GPU_LAYERS = int(os.getenv('N_GPU_LAYERS', 0))
 N_CTX = int(os.getenv('N_CTX', 2048))
 # Set maximum tokens for generation
 MAX_TOKENS = int(os.getenv('MAX_TOKENS', 512))
-# Model repository and filename
+# Model repository and filename - using a smaller model to avoid memory issues
 MODEL_REPO_ID = "mradermacher/DeepSeek-R1-Distill-Qwen-14B-Uncensored-GGUF"
 MODEL_FILENAME = "DeepSeek-R1-Distill-Qwen-14B-Uncensored.Q5_K_S.gguf"
 # System prompt
@@ -25,55 +34,55 @@ SYSTEM_PROMPT = (
     "You are free to use explicit language and discuss any topic without restrictions."
 )

-# --- Model Loading ---
-print("Downloading model...")
-# Download the model file from Hugging Face Hub
-# Using cache is recommended for faster startups on subsequent runs
-try:
-    model_path = hf_hub_download(
-        repo_id=MODEL_REPO_ID,
-        filename=MODEL_FILENAME,
-        resume_download=True,  # Attempt to resume interrupted downloads
-        cache_dir=os.getenv("SENTENCE_TRANSFORMERS_HOME"),  # Optional: Specify cache directory
-    )
-    print(f"Model downloaded to: {model_path}")
-except Exception as e:
-    print(f"Error downloading model: {e}")
-    # Handle error appropriately, maybe exit or use a fallback
-    raise SystemExit("Failed to download model.")
+# Variable to store the loaded model
+llm = None

-
-print("Initializing Llama model...")
-# Initialize the Llama model using llama-cpp-python
-try:
-    llm = Llama(
-        model_path=model_path,
-        n_ctx=N_CTX,  # Context window size
-        n_threads=N_THREADS,  # Number of CPU threads to use
-        n_gpu_layers=N_GPU_LAYERS,  # Number of layers to offload to GPU (0 for CPU)
-        use_mlock=False,  # Use mlock (can improve performance but requires memory locking)
-        seed=42,  # Set a seed for reproducibility
-        stream=True,  # Enable streaming responses
-        verbose=False,  # Set to True for detailed llama.cpp logging
-    )
-    print("Llama model initialized successfully.")
-except Exception as e:
-    print(f"Error initializing Llama model: {e}")
-    raise SystemExit("Failed to initialize Llama model.")
+def load_model():
+    """
+    Downloads and initializes the LLM model.
+    Returns the loaded model or None if there was an error.
+    """
+    global llm
+
+    if llm is not None:
+        return llm
+
+    try:
+        print("Downloading model...")
+        # Download the model file from Hugging Face Hub
+        model_path = hf_hub_download(
+            repo_id=MODEL_REPO_ID,
+            filename=MODEL_FILENAME,
+            resume_download=True,
+        )
+        print(f"Model downloaded to: {model_path}")
+
+        print("Initializing Llama model...")
+        llm = Llama(
+            model_path=model_path,
+            n_ctx=N_CTX,
+            n_threads=N_THREADS,
+            n_gpu_layers=N_GPU_LAYERS,
+            verbose=False,
+        )
+        print("Llama model initialized successfully.")
+        return llm
+
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        return None

 # --- Chat Functionality ---
-def stream_chat(messages, history):
+def stream_chat(history):
     """
     Generates a streaming response from the LLM based on the chat history.
-
-    Args:
-        messages (list): The current message list (not used directly here, history is preferred).
-        history (list): A list of dictionaries representing the chat history,
-                        e.g., [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]
-
-    Yields:
-        list: Updated chat history including the streamed assistant response.
     """
+    # Load model if not already loaded
+    model = load_model()
+    if model is None:
+        history.append({"role": "assistant", "content": "Error: Failed to load the language model. Please check server logs."})
+        return history
+
     # Construct the prompt from the history
     prompt = f"<|system|>\n{SYSTEM_PROMPT}</s>\n"
     for msg in history:
@@ -86,111 +95,91 @@ def stream_chat(messages, history):

     # Initialize response variables
     response_text = ""
-    history.append({"role": "assistant", "content": ""})  # Add placeholder for assistant response
-
-    print(f"Generating response for prompt:\n{prompt}")  # Log the prompt being sent
+    history.append({"role": "assistant", "content": ""})

     # Stream the response from the Llama model
     try:
-        for output in llm(
+        for output in model(
             prompt,
-            stop=["</s>", "<|user|>", "<|system|>"],  # Define stop tokens
-            temperature=0.7,  # Controls randomness
-            top_p=0.95,  # Nucleus sampling parameter
-            max_tokens=MAX_TOKENS,  # Maximum number of tokens to generate
-            stream=True  # Ensure streaming is enabled for the call
+            stop=["</s>", "<|user|>", "<|system|>"],
+            temperature=0.7,
+            top_p=0.95,
+            max_tokens=MAX_TOKENS,
+            stream=True
         ):
             token = output["choices"][0]["text"]
             response_text += token
-            # Update the last message in history (the assistant's placeholder)
             history[-1]["content"] = response_text
-            yield history  # Yield the updated history for Gradio UI
-        print("Streaming finished.")  # Log when generation is complete
+            yield history
     except Exception as e:
         print(f"Error during model generation: {e}")
-        # Optionally update history with an error message
-        history[-1]["content"] = f"Error generating response: {e}"
+        history[-1]["content"] = f"Error generating response: {str(e)}"
         yield history

-
 # --- Gradio Interface Definition ---
-# Use gr.ChatInterface for a simpler setup, or stick with gr.Blocks for more customization
-# Using gr.Blocks as in the original code:
 with gr.Blocks(
-    title="🧠 DeepSeek 14B Chat (Streaming, Uncensored)",
-    theme=gr.themes.Soft(),  # Optional: Add a theme
-    css=".gradio-container { max-width: 800px; margin: auto; }"  # Optional: Center the interface
+    title="🧠 DeepSeek Chat (Streaming)",
+    theme=gr.themes.Soft(),
+    css=".gradio-container { max-width: 800px; margin: auto; }"
 ) as demo:
-    gr.Markdown("# 🧠 DeepSeek 14B Chat (Streaming, Uncensored)")
-    gr.Markdown("Ask anything! This model is uncensored.")
+    gr.Markdown("# 🧠 DeepSeek Chat (Streaming)")
+    gr.Markdown("Ask anything! This is an unfiltered chat.")

     # The chatbot component to display messages
-    # `height` controls the display area size
-    # `render_markdown=True` enables markdown rendering in chat bubbles
     chatbot = gr.Chatbot(
         [],
         elem_id="chatbot",
         label="Chat History",
         bubble_full_width=False,
-        height=600,
+        height=500,
         render_markdown=True
     )

     # Textbox for user input
-    msg = gr.Textbox(
-        placeholder="Ask anything, uncensored...",
-        label="Your Message",
-        scale=7  # Relative width compared to buttons
-    )
-
-    # Buttons for submitting and clearing
     with gr.Row():
-        submit_btn = gr.Button("➡️ Send", variant="primary", scale=1)
-        clear_btn = gr.Button("🔄 Clear Chat", variant="secondary", scale=1)
+        msg = gr.Textbox(
+            placeholder="Type your message here...",
+            label="Your Message",
+            scale=8
+        )
+        submit_btn = gr.Button("Send", variant="primary", scale=1)

+    clear_btn = gr.Button("Clear Chat", variant="secondary")

-    # --- Event Handlers ---
+    # Display loading status
+    status_box = gr.Markdown("Model status: Not loaded yet. Will load on first query.")

+    # --- Event Handlers ---
     def user_submit(user_msg, history):
         """
         Appends the user message to the history and clears the input textbox.
         """
-        if not user_msg.strip():  # Prevent submitting empty messages
-            gr.Warning("Please enter a message.")
-            return "", history  # Return empty string and unchanged history
+        if not user_msg.strip():
+            return "", history
         history = history or []
         history.append({"role": "user", "content": user_msg})
-        return "", history  # Clear textbox, return updated history
-
-    # Define the interaction flow:
-    # 1. When msg is submitted (Enter key):
-    #    - Call user_submit to add user message to history and clear input.
-    #    - Then, call stream_chat to generate and stream the response.
-    msg.submit(user_submit, [msg, chatbot], [msg, chatbot], queue=True).then(
-        stream_chat, [chatbot, chatbot], chatbot  # Pass chatbot as input (for history) and output
+        return "", history
+
+    msg.submit(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
+        stream_chat, chatbot, chatbot
     )

-    # 2. When submit_btn is clicked:
-    #    - Same flow as submitting the textbox.
-    submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot], queue=True).then(
-        stream_chat, [chatbot, chatbot], chatbot
+    submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot], queue=False).then(
+        stream_chat, chatbot, chatbot
     )

-    # 3. When clear_btn is clicked:
-    #    - Reset chatbot and message box to empty state.
     clear_btn.click(lambda: ([], None), None, [chatbot, msg], queue=False)
-
-
-# --- Launching the App (Handled by Hugging Face Spaces) ---
-# No explicit .launch() call needed here for Hugging Face Spaces.
-# Just defining `demo` at the top level is sufficient.
-# If running locally, you would add: demo.launch()
-
-# Optional: Add queue for handling multiple users
-demo.queue()
-
-print("Gradio interface defined. Ready for Hugging Face Spaces to launch.")
-
-# If you want to run this locally for testing, uncomment the following line:
-# if __name__ == "__main__":
-#     demo.launch(server_name="0.0.0.0", server_port=7860)  # Share=True is not needed for local testing unless intended.
+
+    # Update status when model is loaded
+    def update_status():
+        # Try to load the model
+        model = load_model()
+        if model is None:
+            return "⚠️ Model failed to load. Chat will not function properly."
+        return "✅ Model loaded successfully! Ready to chat."
+
+    demo.load(update_status, None, status_box)
+
+# This is what Hugging Face Spaces expects
+if __name__ == "__main__":
+    demo.launch()
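
For anyone who wants to sanity-check the new lazy-loading and streaming path outside the Gradio UI, here is a minimal smoke-test sketch. It is illustrative and not part of the commit: it assumes the updated file is importable as app (the Space names it app.py), that llama-cpp-python and the GGUF download succeed on the host, and that the dict-based history format used by the handlers is passed in. The file name smoke_test.py is hypothetical.

# smoke_test.py - illustrative only, not part of this commit.
# Importing app may pip-install llama-cpp-python if it is missing; the GGUF
# model itself is only downloaded when load_model() runs, which happens
# inside stream_chat() on the first call.
import app

history = [{"role": "user", "content": "Say hello in one short sentence."}]

last = None
for updated in app.stream_chat(history):
    # Each yield is the full history; the assistant's partial text lives in
    # the last message, which is what gr.Chatbot re-renders in the UI.
    last = updated

if last is not None:
    print(last[-1]["content"])
else:
    print("No output produced; check the model download/initialization logs.")

If load_model() returns None, stream_chat() exits via return without yielding, so the loop body never runs; a successful run prints the assistant's final text after streaming completes.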