Commit 66d6d1f
Parent: fc78ae4
Commit message: "agent local model cpp"

Files changed:
- utils/llama_cpp_model.py +62 -263
- utils/local_model.py +3 -2
utils/llama_cpp_model.py
CHANGED
@@ -1,296 +1,95 @@

Old version (lines marked "-" were removed; unmarked lines are unchanged context; "…" and "# …" mark content missing from the original listing):

-"""
-Fallback model implementation for testing when llama-cpp-python is not available.
-
-This provides a compatible model class that doesn't require any external dependencies,
-allowing the rest of the application to function while we solve the llama-cpp-python
-installation issues.
-"""
-
import os
import logging
-from typing import Dict, List, Optional, Any
-import requests
-from smolagents import Model
from pathlib import Path

-# Try to import llama_cpp, but don't fail if not available
try:
    from llama_cpp import Llama
-    from pathlib import Path
    LLAMA_CPP_AVAILABLE = True
except ImportError:
    LLAMA_CPP_AVAILABLE = False
    print("llama_cpp module not available, using fallback implementation")

-logger = logging.getLogger(…

class LlamaCppModel(Model):
-    """…
-    Falls back to a simple text generation if llama_cpp is not available."""
-    def __init__(
-        self,
-        model_path: str = None,
-        model_url: str = None,
-        n_ctx: int = 2048,
-        n_gpu_layers: int = 0,
-        max_tokens: int = 512,
-        temperature: float = 0.7,
-        verbose: bool = True
-    ):
-        """
-        Initialize a local llama.cpp model or fallback to a simple implementation.
-
-        Args:
-            model_path: Path to local GGUF model file
-            model_url: URL to download model if model_path doesn't exist
-            n_ctx: Context window size
-            n_gpu_layers: Number of layers to offload to GPU (0 means CPU only)
-            max_tokens: Maximum new tokens to generate
-            temperature: Sampling temperature
-            verbose: Whether to print verbose messages
-        """
        super().__init__()
-
        self.model_path = model_path
-        self.model_url = model_url
        self.n_ctx = n_ctx
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.verbose = verbose
        self.llm = None
-        # …
-                    verbose=verbose
-                )
-
-                if self.verbose:
-                    print("LlamaCpp model loaded successfully")
-            else:
-                if self.verbose:
-                    print(f"Model path not found or not specified. Using fallback mode.")
-        except Exception as e:
-            logger.error(f"Error initializing LlamaCpp model: {e}")
-            if self.verbose:
-                print(f"Error initializing LlamaCpp model: {e}")
-            self.llm = None
-        else:
-            if self.verbose:
-                print("LlamaCpp not available, using fallback implementation")
-
-        if not self.llm and self.verbose:
-            print("Using fallback text generation mode")
-
-    def _resolve_model_path(self, model_path: str = None, model_url: str = None) -> str:
-        """
-        Resolve model path, downloading if necessary.
-
-        Returns:
-            Absolute path to model file
-        """
-        # Default to a small model if none specified
-        if not model_path:
-            models_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "models")
-            os.makedirs(models_dir, exist_ok=True)
-            model_path = os.path.join(models_dir, "ggml-model-q4_0.bin")
-
-        # Convert to Path for easier handling
-        path = Path(model_path)
-
-        # If model exists, return it
-        if path.exists():
-            return str(path.absolute())
-
-        # Download if URL provided
-        if model_url and not path.exists():
-            try:
-                print(f"Downloading model from {model_url}...")
-                os.makedirs(path.parent, exist_ok=True)
-
-                try:
-                    # Try with streaming download first
-                    with requests.get(model_url, stream=True, timeout=30) as r:
-                        r.raise_for_status()
-                        total_size = int(r.headers.get('content-length', 0))
-                        block_size = 8192
-
-                        with open(path, 'wb') as f:
-                            downloaded = 0
-                            for chunk in r.iter_content(chunk_size=block_size):
-                                if chunk:
-                                    f.write(chunk)
-                                    downloaded += len(chunk)
-                                    if total_size > 0:
-                                        percent = (downloaded / total_size) * 100
-                                        if percent % 10 < (block_size / total_size) * 100:
-                                            print(f"Download progress: {int(percent)}%")
-                except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e:
-                    print(f"Streaming download timed out: {e}. Using a simpler approach...")
-                    # Fall back to simpler download method
-                    r = requests.get(model_url, timeout=60)
-                    r.raise_for_status()
-                    with open(path, 'wb') as f:
-                        f.write(r.content)
-                    print("Download complete with simple method")
-
-                print(f"Model download complete: {path}")
-                return str(path.absolute())
-            except Exception as e:
-                logger.error(f"Error downloading model: {e}")
-                print(f"Error downloading model: {e}")
-                print("Continuing with dummy model instead...")
-                # Create a small dummy model file so we can continue
-                with open(path, 'wb') as f:
-                    f.write(b"DUMMY MODEL")
-                return str(path.absolute())
-
-        # If we get here without a model, create a dummy one
-        print(f"Model file not found at {model_path} and no URL provided. Creating dummy model...")
-        os.makedirs(path.parent, exist_ok=True)
-        with open(path, 'wb') as f:
-            f.write(b"DUMMY MODEL")
-        return str(path.absolute())
-
    def generate(self, prompt: str, **kwargs) -> str:
-        """
-        Generate text completion for the given prompt.
-
-        Args:
-            prompt: Input text
-
-        Returns:
-            Generated text completion
-        """
        try:
-            # …
-            if …
-                # …
-                    max_tokens=self.max_tokens,
-                    temperature=self.temperature,
-                    echo=False  # Don't include the prompt in the response
-                )
-
-                # Extract generated text
-                if not response:
-                    return ""
-
-                if isinstance(response, dict):
-                    generated_text = response.get('choices', [{}])[0].get('text', '')
-                else:
-                    # List of responses
-                    generated_text = response[0].get('text', '')
-
-                return generated_text.strip()
            else:
-                # …
-                words = prompt.strip().split()
-                last_words = ' '.join(words[-10:]) if len(words) > 10 else prompt
-
-                # Simple response generation based on prompt content
-                if "?" in prompt:
-                    return f"Based on the information provided, I believe the answer is related to {last_words}. This is a fallback response as the LLM model could not be loaded."
-                else:
-                    return f"I understand you're asking about {last_words}. Since I'm running in fallback mode without a proper language model, I can only acknowledge your query but not provide a detailed response."
-
        except Exception as e:
-            logger.…
-
-    def generate_with_tools(
-        self,
-        messages: List[Dict[str, Any]],
-        tools: Optional[List[Dict[str, Any]]] = None,
-        **kwargs
-    ) -> Dict[str, Any]:
-        """
-        Generate a response with tool-calling capabilities.
-        This method implements the smolagents Model interface for tool-calling.
-
-        Args:
-            messages: List of message objects with role and content
-            tools: List of tool definitions
-
-        Returns:
-            Response with message and optional tool calls
-        """
        try:
-            # Format messages into a prompt
            prompt = self._format_messages_to_prompt(messages, tools)
-
-            # Generate response
            completion = self.generate(prompt)
-
-            # For now, just return the text without tool parsing
-            return {
-                "message": {
-                    "role": "assistant",
-                    "content": completion
-                }
-            }
        except Exception as e:
-            logger.…
-            # …
-                    "role": "assistant",
-                    "content": f"Error: {str(e)}"
-                }
-            }
-
-    def _format_messages_to_prompt(
-        self,
-        messages: List[Dict[str, Any]],
-        tools: Optional[List[Dict[str, Any]]] = None
-    ) -> str:
-        """Format chat messages into a text prompt for the model."""
        formatted_prompt = ""
-
-        # …
-            tool_descriptions = "\n".join([
-                f"Tool {i+1}: {tool['name']} - {tool['description']}"
-                for i, tool in enumerate(tools)
-            ])
-            formatted_prompt += f"Available tools:\n{tool_descriptions}\n\n"
-
-        # Add conversation history
        for msg in messages:
            role = msg.get("role", "")
            content = msg.get("content", "")
-            # …
-            elif role == "user":
-                formatted_prompt += f"User: {content}\n\n"
-            elif role == "assistant":
-                formatted_prompt += f"Assistant: {content}\n\n"
-
-        # Add final prompt for assistant
        formatted_prompt += "Assistant: "
-
-        return formatted_prompt
New version (lines marked "+" were added; unmarked lines are unchanged context):

import os
import logging
+from typing import Dict, List, Optional, Any
from pathlib import Path
+from smolagents import Model

try:
    from llama_cpp import Llama
    LLAMA_CPP_AVAILABLE = True
except ImportError:
    LLAMA_CPP_AVAILABLE = False
    print("llama_cpp module not available, using fallback implementation")

+logger = logging.getLogger("LlamaCppModel")
+logger.setLevel(logging.DEBUG)
+ch = logging.StreamHandler()
+ch.setLevel(logging.DEBUG)
+formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+ch.setFormatter(formatter)
+logger.addHandler(ch)

class LlamaCppModel(Model):
+    def __init__(self, model_path: str, n_ctx: int = 2048, n_gpu_layers: int = 0, max_tokens: int = 512, temperature: float = 0.7, verbose: bool = True):
        super().__init__()
        self.model_path = model_path
        self.n_ctx = n_ctx
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.verbose = verbose
        self.llm = None
+
+        if not LLAMA_CPP_AVAILABLE:
+            logger.error("llama_cpp is not installed. Please install with 'pip install llama-cpp-python'")
+            raise ImportError("llama_cpp is required but not installed.")
+
+        if not os.path.exists(model_path):
+            logger.error(f"Model file not found at: {model_path}")
+            raise FileNotFoundError(f"Model file not found at: {model_path}")
+
+        try:
+            logger.info(f"Loading Llama model from: {model_path}")
+            self.llm = Llama(model_path=model_path, n_ctx=n_ctx, n_gpu_layers=n_gpu_layers, verbose=verbose)
+            logger.info("Llama model loaded successfully.")
+        except Exception as e:
+            logger.exception(f"Failed to initialize Llama model: {e}")
+            raise
+
    def generate(self, prompt: str, **kwargs) -> str:
        try:
+            logger.debug(f"Generating with prompt: {prompt[:100]}...")
+            response = self.llm(prompt=prompt, max_tokens=self.max_tokens, temperature=self.temperature, echo=False)
+            logger.debug(f"Raw response: {response}")
+
+            if isinstance(response, dict) and 'choices' in response:
+                text = response['choices'][0]['text'].strip()
+            elif isinstance(response, list):
+                text = response[0].get('text', '').strip()
            else:
+                logger.warning("Unexpected response format from Llama.")
+                text = str(response)
+
+            logger.debug(f"Generated text: {text}")
+            return text
        except Exception as e:
+            logger.exception(f"Error generating text: {e}")
+            return f"Error generating response: {e}"
+
+    def generate_with_tools(self, messages: List[Dict[str, Any]], tools: Optional[List[Dict[str, Any]]] = None, **kwargs) -> Dict[str, Any]:
        try:
            prompt = self._format_messages_to_prompt(messages, tools)
+            logger.debug(f"Formatted prompt: {prompt}")
            completion = self.generate(prompt)
+            return {"message": {"role": "assistant", "content": completion}}
        except Exception as e:
+            logger.exception(f"Error generating with tools: {e}")
+            return {"message": {"role": "assistant", "content": f"Error: {e}"}}
+
+    def _format_messages_to_prompt(self, messages: List[Dict[str, Any]], tools: Optional[List[Dict[str, Any]]] = None) -> str:
        formatted_prompt = ""
+        if tools:
+            tool_desc = "\n".join([f"Tool {i+1}: {t['name']} - {t['description']}" for i, t in enumerate(tools)])
+            formatted_prompt += f"Available tools:\n{tool_desc}\n\n"
        for msg in messages:
            role = msg.get("role", "")
            content = msg.get("content", "")
+            if isinstance(content, list):
+                content = " ".join([c.get("text", str(c)) if isinstance(c, dict) else str(c) for c in content])
+            formatted_prompt += f"{role.capitalize()}: {content}\n\n"
        formatted_prompt += "Assistant: "
+        logger.debug(f"Constructed prompt: {formatted_prompt}")
+        return formatted_prompt
+
+# Example usage (for testing):
+# model = LlamaCppModel(model_path="/path/to/your/llama-model.gguf")
+# print(model.generate("Hello, how are you?"))
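For context, a short usage sketch of the rewritten class, expanding the commented-out example above. It assumes llama-cpp-python and smolagents are installed, the script is run from the repository root, and a GGUF file exists at the placeholder path; the path, prompts, and messages below are illustrative and not part of the commit.

from utils.llama_cpp_model import LlamaCppModel

# Placeholder path: point this at any local GGUF model file.
model = LlamaCppModel(model_path="models/llama-model.gguf", n_ctx=2048, max_tokens=256)

# Plain text completion via generate().
print(model.generate("Q: What does llama.cpp do?\nA:"))

# Chat-style call: messages are flattened into a single prompt by
# _format_messages_to_prompt() and then passed through generate().
messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Name one advantage of running a GGUF model locally."},
]
result = model.generate_with_tools(messages)
print(result["message"]["content"])

Unlike the removed fallback version, the constructor now raises ImportError or FileNotFoundError instead of silently switching to canned responses, so misconfiguration surfaces at startup rather than at generation time.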
utils/local_model.py
CHANGED
@@ -96,8 +96,9 @@ class LocalTransformersModel(Model):
            return generated_text.strip()

        except Exception as e:
-            # …
-            # …
+            error_msg = f"Error generating text (Local model): {e}"
+            logger.error(error_msg)
+            print(error_msg)
            return f"Error: {str(e)}"

    def generate_with_tools(