MohammedSameerSyed committed on
Commit 0664808 · verified · 1 Parent(s): 9639342

using gguf from own repo
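In short: load_llamacpp_model() now falls back to downloading the GGUF from the author's own Hub repo when no local copy is found. A minimal sketch of just that download step, using only the values that appear in the diff below (hf_hub_download comes from the huggingface_hub package; the surrounding path search and error handling are omitted here):

from huggingface_hub import hf_hub_download

# Fetches the quantized Phi-3 GGUF into models/ (cached on later calls) and
# returns the local file path that is then handed to llama_cpp.Llama.
gguf_path = hf_hub_download(
    repo_id="MohammedSameerSyed/phi3-gguf",
    filename="Phi-3-mini-4k-instruct.Q4_K_M.gguf",
    cache_dir="models",
)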

Files changed (1)
  1. utils/text_model.py +365 -364
utils/text_model.py CHANGED
@@ -1,365 +1,366 @@
- import os
- import threading
- import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-
- from transformers.generation.utils import DynamicCache
- DynamicCache.get_max_length = DynamicCache.get_max_cache_shape
-
-
- # Check if llama-cpp-python is available
- def check_llamacpp_available():
-     try:
-         import llama_cpp
-         return True
-     except ImportError:
-         return False
-
- # Global cache for model and tokenizer
- MODEL_CACHE = {}
-
- def load_text_model(model_name, quantize=False):
-     """
-     Load text model with appropriate configuration for CPU or GPU
-
-     Args:
-         model_name (str): Hugging Face model ID
-         quantize (bool): Whether to use 4-bit quantization (only works with GPU)
-
-     Returns:
-         tuple: (model, tokenizer)
-     """
-     # Check cache first
-     cache_key = f"{model_name}_{quantize}"
-     if cache_key in MODEL_CACHE:
-         return MODEL_CACHE[cache_key]
-
-     # Check CUDA availability
-     cuda_available = torch.cuda.is_available()
-
-     # Only try quantization if CUDA is available
-     if quantize and cuda_available:
-         try:
-             quantization_config = BitsAndBytesConfig(
-                 load_in_4bit=True,
-                 bnb_4bit_compute_dtype=torch.float16,
-                 bnb_4bit_quant_type="nf4",
-                 bnb_4bit_use_double_quant=True
-             )
-         except Exception as e:
-             print(f"Quantization config creation failed: {e}")
-             quantization_config = None
-             quantize = False
-     else:
-         quantization_config = None
-         quantize = False
-
-     # Try loading the model
-     try:
-         tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-         # Fix for attention mask warning
-         if tokenizer.pad_token is None:
-             tokenizer.pad_token = tokenizer.eos_token
-
-         # Try with quantization first if requested and available
-         if quantize and quantization_config:
-             try:
-                 model = AutoModelForCausalLM.from_pretrained(
-                     model_name,
-                     quantization_config=quantization_config,
-                     device_map="auto",
-                     trust_remote_code=True
-                 )
-             except Exception as e:
-                 print(f"Failed to load with quantization: {e}")
-                 quantize = False
-
-         # If quantization is not used or failed, try standard loading
-         if not quantize:
-             # For CPU, just load without specifing dtype
-             if not cuda_available:
-                 model = AutoModelForCausalLM.from_pretrained(
-                     model_name,
-                     device_map="auto",
-                     trust_remote_code=True
-                 )
-             else:
-                 # Try different dtypes for GPU
-                 for dtype in (torch.float16, torch.float32):
-                     try:
-                         model = AutoModelForCausalLM.from_pretrained(
-                             model_name,
-                             torch_dtype=dtype,
-                             device_map="auto",
-                             trust_remote_code=True
-                         )
-                         break
-                     except Exception as e:
-                         if dtype == torch.float32:
-                             # Last resort: try without specifying dtype
-                             model = AutoModelForCausalLM.from_pretrained(
-                                 model_name,
-                                 device_map="auto",
-                                 trust_remote_code=True
-                             )
-
-         # Cache the loaded model and tokenizer
-         MODEL_CACHE[cache_key] = (model, tokenizer)
-         return model, tokenizer
-
-     except Exception as e:
-         raise RuntimeError(f"Failed to load model {model_name}: {e}")
-
- def format_prompt(tokenizer, query):
-     """
-     Format prompt according to model's requirements
-
-     Args:
-         tokenizer: The model tokenizer
-         query (str): User query
-
-     Returns:
-         str: Formatted prompt
-     """
-     enhanced_query = f"Please answer this question about pharmaceuticals or medical topics.\n\nQuestion: {query}"
-
-     # Use chat template if available
-     if hasattr(tokenizer, "apply_chat_template") and callable(getattr(tokenizer, "apply_chat_template")):
-         messages = [{"role": "user", "content": enhanced_query}]
-         try:
-             formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-             return formatted
-         except:
-             # Fallback if chat template fails
-             pass
-
-     # Simple formatting fallback
-     return f"User: {enhanced_query}\nAssistant:"
-
- def generate_text_with_transformers(model, tokenizer, query, max_tokens=512, temperature=0.7,
-                                     top_p=0.9, repetition_penalty=1.1, cancel_event=None,
-                                     progress_callback=None):
-     """
-     Generate text using the transformers pipeline
-
-     Args:
-         model: The language model
-         tokenizer: The tokenizer
-         query (str): User query
-         max_tokens (int): Maximum tokens to generate
-         temperature (float): Temperature for sampling
-         top_p (float): Top-p sampling parameter
-         repetition_penalty (float): Penalty for repetition
-         cancel_event (threading.Event): Event to signal cancellation
-         progress_callback (callable): Function to report progress
-
-     Returns:
-         str: Generated response
-     """
-     # Format the prompt
-     prompt = format_prompt(tokenizer, query)
-
-     # Prepare inputs
-     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-
-     # Update progress
-     if progress_callback:
-         progress_callback(0.2, "Starting generation...")
-
-     try:
-         from transformers import TextIteratorStreamer
-
-         # Set up streamer for token-by-token generation
-         streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-
-         # Prepare generation parameters
-         generation_kwargs = {
-             "input_ids": inputs.input_ids,
-             "attention_mask": inputs.attention_mask,  # Explicitly provide attention mask
-             "max_new_tokens": max_tokens,
-             "temperature": temperature,
-             "top_p": top_p,
-             "repetition_penalty": repetition_penalty,
-             "do_sample": temperature > 0.1,
-             "streamer": streamer
-         }
-
-         # Start generation in a separate thread
-         generation_thread = threading.Thread(
-             target=model.generate,
-             kwargs=generation_kwargs
-         )
-         generation_thread.start()
-
-         # Collect tokens as they're generated
-         response_text = ""
-
-         for i, new_text in enumerate(streamer):
-             if cancel_event and cancel_event.is_set():
-                 break
-
-             response_text += new_text
-
-             # Update progress periodically
-             if progress_callback and i % 5 == 0:
-                 progress_callback(0.3 + min(0.6, len(response_text) / 500), "Generating response...")
-
-         return response_text
-
-     except Exception as e:
-         print(f"Streaming generation failed, falling back to standard generation: {e}")
-         # Fallback to standard generation
-         try:
-             outputs = model.generate(
-                 inputs.input_ids,
-                 attention_mask=inputs.attention_mask,
-                 max_new_tokens=max_tokens,
-                 temperature=temperature,
-                 top_p=top_p,
-                 repetition_penalty=repetition_penalty,
-                 do_sample=temperature > 0.1,
-             )
-
-             # Decode and remove prompt
-             prompt_length = inputs.input_ids.shape[1]
-             response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
-
-             return response
-         except Exception as e2:
-             return f"Error in text generation: {e2}"
-
- # Global llamacpp model cache
- LLAMA_MODEL = None
-
- def load_llamacpp_model(model_path=None):
-     """Load the llama.cpp model"""
-     global LLAMA_MODEL
-
-     # Return cached model if available
-     if LLAMA_MODEL is not None:
-         return LLAMA_MODEL
-
-     try:
-         from llama_cpp import Llama
-
-         # Use provided path or check for model in predefined locations
-         if model_path is None:
-             # Try to find model in standard locations
-             possible_paths = [
-                 "models/Phi-3-mini-4k-instruct.Q4_K_M.gguf",  # Local models dir
-                 os.path.join(os.path.dirname(os.path.dirname(__file__)), "models/Phi-3-mini-4k-instruct.Q4_K_M.gguf"),  # Project root
-                 "/models/Phi-3-mini-4k-instruct.Q4_K_M.gguf",  # Docker container
-                 os.path.expanduser("~/.cache/huggingface/hub/models/Phi-3-mini-4k-instruct.Q4_K_M.gguf")  # HF cache
-             ]
-
-             for path in possible_paths:
-                 if os.path.exists(path):
-                     model_path = path
-                     break
-
-             if model_path is None:
-                 raise FileNotFoundError("Could not find GGUF model file. Please provide the path explicitly.")
-
-         # Load the model
-         LLAMA_MODEL = Llama(
-             model_path=model_path,
-             n_ctx=4096,  # Context window size
-             n_batch=512,  # Batch size for prompt processing
-             n_threads=4,  # CPU threads to use
-             n_gpu_layers=0  # Set higher if you have GPU
-         )
-
-         return LLAMA_MODEL
-
-     except ImportError:
-         raise ImportError("llama-cpp-python is not installed. Please install it to use this functionality.")
-     except Exception as e:
-         raise RuntimeError(f"Failed to load llama.cpp model: {e}")
-
- def generate_text_with_llamacpp(query, max_tokens=512, temperature=0.7, top_p=0.9,
-                                 stop=None, cancel_event=None, progress_callback=None, model_path=None):
-     """
-     Generate text using llama.cpp
-
-     Args:
-         query (str): User query
-         max_tokens (int): Maximum tokens to generate
-         temperature (float): Temperature for sampling
-         top_p (float): Top-p sampling parameter
-         stop (list): List of stop sequences
-         cancel_event (threading.Event): Event to signal cancellation
-         progress_callback (callable): Function to report progress
-         model_path (str): Path to GGUF model file (optional)
-
-     Returns:
-         str: Generated response
-     """
-     if progress_callback:
-         progress_callback(0.1, "Loading llama.cpp model...")
-
-     # Load model
-     try:
-         model = load_llamacpp_model(model_path)
-     except Exception as e:
-         raise RuntimeError(f"Failed to load llama.cpp model: {e}")
-
-     if progress_callback:
-         progress_callback(0.3, "Starting generation...")
-
-     # Format prompt
-     prompt = f"You are a helpful pharmaceutical assistant. Please answer this question about medications or medical topics.\n\nQuestion: {query}\n\nAnswer:"
-
-     # Define stop sequences if not provided
-     if stop is None:
-         stop = ["Question:", "\n\n"]
-
-     try:
-         # Check if create_completion method exists (newer versions)
-         if hasattr(model, "create_completion"):
-             # Stream response
-             response_text = ""
-
-             # Generate completion with streaming
-             stream = model.create_completion(
-                 prompt,
-                 max_tokens=1024,
-                 temperature=temperature,
-                 top_p=top_p,
-                 top_k=40,
-                 stop=None,
-                 stream=True
-             )
-
-             # Process stream
-             for i, chunk in enumerate(stream):
-                 if cancel_event and cancel_event.is_set():
-                     break
-
-                 text_chunk = chunk["choices"][0]["text"]
-                 response_text += text_chunk
-
-                 # Update progress periodically
-                 if progress_callback and i % 5 == 0:
-                     progress_callback(0.4 + min(0.5, len(response_text) / 500), "Generating response...")
-
-             return response_text.strip()
-         else:
-             # Fallback to older call method
-             result = model(
-                 prompt,
-                 max_tokens=max_tokens,
-                 temperature=temperature,
-                 top_p=top_p,
-                 top_k=40,
-                 stop=stop,
-                 echo=False
-             )
-
-             if progress_callback:
-                 progress_callback(0.9, "Finalizing...")
-
-             return result["choices"][0]["text"].strip()
-
-     except Exception as e:
-         raise RuntimeError(f"Error in llama.cpp generation: {e}")
+ import os
+ import threading
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+
+ from transformers.generation.utils import DynamicCache
+ DynamicCache.get_max_length = DynamicCache.get_max_cache_shape
+
+
+ # Check if llama-cpp-python is available
+ def check_llamacpp_available():
+     try:
+         import llama_cpp
+         return True
+     except ImportError:
+         return False
+
+ # Global cache for model and tokenizer
+ MODEL_CACHE = {}
+
+ def load_text_model(model_name, quantize=False):
+     """
+     Load text model with appropriate configuration for CPU or GPU
+
+     Args:
+         model_name (str): Hugging Face model ID
+         quantize (bool): Whether to use 4-bit quantization (only works with GPU)
+
+     Returns:
+         tuple: (model, tokenizer)
+     """
+     # Check cache first
+     cache_key = f"{model_name}_{quantize}"
+     if cache_key in MODEL_CACHE:
+         return MODEL_CACHE[cache_key]
+
+     # Check CUDA availability
+     cuda_available = torch.cuda.is_available()
+
+     # Only try quantization if CUDA is available
+     if quantize and cuda_available:
+         try:
+             quantization_config = BitsAndBytesConfig(
+                 load_in_4bit=True,
+                 bnb_4bit_compute_dtype=torch.float16,
+                 bnb_4bit_quant_type="nf4",
+                 bnb_4bit_use_double_quant=True
+             )
+         except Exception as e:
+             print(f"Quantization config creation failed: {e}")
+             quantization_config = None
+             quantize = False
+     else:
+         quantization_config = None
+         quantize = False
+
+     # Try loading the model
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+         # Fix for attention mask warning
+         if tokenizer.pad_token is None:
+             tokenizer.pad_token = tokenizer.eos_token
+
+         # Try with quantization first if requested and available
+         if quantize and quantization_config:
+             try:
+                 model = AutoModelForCausalLM.from_pretrained(
+                     model_name,
+                     quantization_config=quantization_config,
+                     device_map="auto",
+                     trust_remote_code=True
+                 )
+             except Exception as e:
+                 print(f"Failed to load with quantization: {e}")
+                 quantize = False
+
+         # If quantization is not used or failed, try standard loading
+         if not quantize:
+             # For CPU, just load without specifying dtype
+             if not cuda_available:
+                 model = AutoModelForCausalLM.from_pretrained(
+                     model_name,
+                     device_map="auto",
+                     trust_remote_code=True
+                 )
+             else:
+                 # Try different dtypes for GPU
+                 for dtype in (torch.float16, torch.float32):
+                     try:
+                         model = AutoModelForCausalLM.from_pretrained(
+                             model_name,
+                             torch_dtype=dtype,
+                             device_map="auto",
+                             trust_remote_code=True
+                         )
+                         break
+                     except Exception as e:
+                         if dtype == torch.float32:
+                             # Last resort: try without specifying dtype
+                             model = AutoModelForCausalLM.from_pretrained(
+                                 model_name,
+                                 device_map="auto",
+                                 trust_remote_code=True
+                             )
+
+         # Cache the loaded model and tokenizer
+         MODEL_CACHE[cache_key] = (model, tokenizer)
+         return model, tokenizer
+
+     except Exception as e:
+         raise RuntimeError(f"Failed to load model {model_name}: {e}")
+
+ def format_prompt(tokenizer, query):
+     """
+     Format prompt according to model's requirements
+
+     Args:
+         tokenizer: The model tokenizer
+         query (str): User query
+
+     Returns:
+         str: Formatted prompt
+     """
+     enhanced_query = f"Please answer this question about pharmaceuticals or medical topics.\n\nQuestion: {query}"
+
+     # Use chat template if available
+     if hasattr(tokenizer, "apply_chat_template") and callable(getattr(tokenizer, "apply_chat_template")):
+         messages = [{"role": "user", "content": enhanced_query}]
+         try:
+             formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+             return formatted
+         except:
+             # Fallback if chat template fails
+             pass
+
+     # Simple formatting fallback
+     return f"User: {enhanced_query}\nAssistant:"
+
+ def generate_text_with_transformers(model, tokenizer, query, max_tokens=512, temperature=0.7,
+                                     top_p=0.9, repetition_penalty=1.1, cancel_event=None,
+                                     progress_callback=None):
+     """
+     Generate text using the transformers pipeline
+
+     Args:
+         model: The language model
+         tokenizer: The tokenizer
+         query (str): User query
+         max_tokens (int): Maximum tokens to generate
+         temperature (float): Temperature for sampling
+         top_p (float): Top-p sampling parameter
+         repetition_penalty (float): Penalty for repetition
+         cancel_event (threading.Event): Event to signal cancellation
+         progress_callback (callable): Function to report progress
+
+     Returns:
+         str: Generated response
+     """
+     # Format the prompt
+     prompt = format_prompt(tokenizer, query)
+
+     # Prepare inputs
+     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+     # Update progress
+     if progress_callback:
+         progress_callback(0.2, "Starting generation...")
+
+     try:
+         from transformers import TextIteratorStreamer
+
+         # Set up streamer for token-by-token generation
+         streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+         # Prepare generation parameters
+         generation_kwargs = {
+             "input_ids": inputs.input_ids,
+             "attention_mask": inputs.attention_mask,  # Explicitly provide attention mask
+             "max_new_tokens": max_tokens,
+             "temperature": temperature,
+             "top_p": top_p,
+             "repetition_penalty": repetition_penalty,
+             "do_sample": temperature > 0.1,
+             "streamer": streamer
+         }
+
+         # Start generation in a separate thread
+         generation_thread = threading.Thread(
+             target=model.generate,
+             kwargs=generation_kwargs
+         )
+         generation_thread.start()
+
+         # Collect tokens as they're generated
+         response_text = ""
+
+         for i, new_text in enumerate(streamer):
+             if cancel_event and cancel_event.is_set():
+                 break
+
+             response_text += new_text
+
+             # Update progress periodically
+             if progress_callback and i % 5 == 0:
+                 progress_callback(0.3 + min(0.6, len(response_text) / 500), "Generating response...")
+
+         return response_text
+
+     except Exception as e:
+         print(f"Streaming generation failed, falling back to standard generation: {e}")
+         # Fallback to standard generation
+         try:
+             outputs = model.generate(
+                 inputs.input_ids,
+                 attention_mask=inputs.attention_mask,
+                 max_new_tokens=max_tokens,
+                 temperature=temperature,
+                 top_p=top_p,
+                 repetition_penalty=repetition_penalty,
+                 do_sample=temperature > 0.1,
+             )
+
+             # Decode and remove prompt
+             prompt_length = inputs.input_ids.shape[1]
+             response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
+
+             return response
+         except Exception as e2:
+             return f"Error in text generation: {e2}"
+
+ # Global llamacpp model cache
+ LLAMA_MODEL = None
+
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download  # needed for the Hub download fallback below
+
+ def load_llamacpp_model(model_path=None):
+     """Load the llama.cpp model, downloading from HF Hub if needed."""
+     global LLAMA_MODEL
+
+     # Return cached model if available
+     if LLAMA_MODEL is not None:
+         return LLAMA_MODEL
+
+     # 1) Look for existing file on disk
+     if model_path is None:
+         possible_paths = [
+             "models/Phi-3-mini-4k-instruct.Q4_K_M.gguf",
+             os.path.join(os.path.dirname(os.path.dirname(__file__)), "models/Phi-3-mini-4k-instruct.Q4_K_M.gguf"),
+             "/models/Phi-3-mini-4k-instruct.Q4_K_M.gguf",
+             os.path.expanduser("~/.cache/huggingface/hub/models/Phi-3-mini-4k-instruct.Q4_K_M.gguf"),
+         ]
+         for p in possible_paths:
+             if os.path.exists(p):
+                 model_path = p
+                 break
+
+     # 2) If still not found, download into models/
+     if model_path is None:
+         print("→ GGUF not found locally, downloading from HF Hub…")
+         model_path = hf_hub_download(
+             repo_id="MohammedSameerSyed/phi3-gguf",  # <— YOUR HF repo with the .gguf
+             filename="Phi-3-mini-4k-instruct.Q4_K_M.gguf",
+             cache_dir="models",  # will create models/ if needed
+         )
+
+     # 3) Finally load with llama.cpp
+     try:
+         LLAMA_MODEL = Llama(
+             model_path=model_path,
+             n_ctx=4096,  # full 4K context
+             n_batch=512,
+             n_threads=4,
+             n_gpu_layers=0
+         )
+         return LLAMA_MODEL
+
+     except Exception as e:
+         raise RuntimeError(f"Failed to load llama.cpp model: {e}")
+
+ def generate_text_with_llamacpp(query, max_tokens=512, temperature=0.7, top_p=0.9,
+                                 stop=None, cancel_event=None, progress_callback=None, model_path=None):
+     """
+     Generate text using llama.cpp
+
+     Args:
+         query (str): User query
+         max_tokens (int): Maximum tokens to generate
+         temperature (float): Temperature for sampling
+         top_p (float): Top-p sampling parameter
+         stop (list): List of stop sequences
+         cancel_event (threading.Event): Event to signal cancellation
+         progress_callback (callable): Function to report progress
+         model_path (str): Path to GGUF model file (optional)
+
+     Returns:
+         str: Generated response
+     """
+     if progress_callback:
+         progress_callback(0.1, "Loading llama.cpp model...")
+
+     # Load model
+     try:
+         model = load_llamacpp_model(model_path)
+     except Exception as e:
+         raise RuntimeError(f"Failed to load llama.cpp model: {e}")
+
+     if progress_callback:
+         progress_callback(0.3, "Starting generation...")
+
+     # Format prompt
+     prompt = f"You are a helpful pharmaceutical assistant. Please answer this question about medications or medical topics.\n\nQuestion: {query}\n\nAnswer:"
+
+     # Define stop sequences if not provided
+     if stop is None:
+         stop = ["Question:", "\n\n"]
+
+     try:
+         # Check if create_completion method exists (newer versions)
+         if hasattr(model, "create_completion"):
+             # Stream response
+             response_text = ""
+
+             # Generate completion with streaming
+             stream = model.create_completion(
+                 prompt,
+                 max_tokens=1024,
+                 temperature=temperature,
+                 top_p=top_p,
+                 top_k=40,
+                 stop=None,
+                 stream=True
+             )
+
+             # Process stream
+             for i, chunk in enumerate(stream):
+                 if cancel_event and cancel_event.is_set():
+                     break
+
+                 text_chunk = chunk["choices"][0]["text"]
+                 response_text += text_chunk
+
+                 # Update progress periodically
+                 if progress_callback and i % 5 == 0:
+                     progress_callback(0.4 + min(0.5, len(response_text) / 500), "Generating response...")
+
+             return response_text.strip()
+         else:
+             # Fallback to older call method
+             result = model(
+                 prompt,
+                 max_tokens=max_tokens,
+                 temperature=temperature,
+                 top_p=top_p,
+                 top_k=40,
+                 stop=stop,
+                 echo=False
+             )
+
+             if progress_callback:
+                 progress_callback(0.9, "Finalizing...")
+
+             return result["choices"][0]["text"].strip()
+
+     except Exception as e:
+         raise RuntimeError(f"Error in llama.cpp generation: {e}")