Locutusque committed
Commit f055cf8 · verified · 1 Parent(s): a2fb289

Update app.py

Files changed (1)
  1. app.py +26 -188
app.py CHANGED
@@ -1,133 +1,13 @@
 import spaces
 import gradio as gr
-from transformers import pipeline, AutoTokenizer, TextIteratorStreamer, AutoModelForCausalLM
+from transformers import pipeline, AutoTokenizer, TextIteratorStreamer
 import torch
-from threading import Thread, Lock, Event
+from threading import Thread
 import os
-import asyncio
-import time
-from datetime import datetime
-import gc
-
-# Global dictionary to store preloaded models and tokenizers
-LOADED_MODELS = {}
-LOADED_TOKENIZERS = {}
-# Lock for thread-safe model access
-MODEL_LOCK = Lock()
-# Event to signal shutdown
-SHUTDOWN_EVENT = Event()
-
-def clear_memory():
-    """Clear GPU and CPU memory"""
-    torch.cuda.empty_cache()
-    gc.collect()
-
-def load_single_model(model_name):
-    """Load a single model and tokenizer"""
-    try:
-        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Loading {model_name}...")
-
-        # Load model to CPU with bfloat16 to save memory
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype=torch.bfloat16,
-            trust_remote_code=True,
-            token=os.environ.get("token"),
-        )
-
-        # Load tokenizer
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_name,
-            trust_remote_code=True,
-            token=os.environ.get("token")
-        )
-        tokenizer.eos_token = "<|im_end|>"
-
-        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Successfully loaded {model_name}")
-        return model, tokenizer
-    except Exception as e:
-        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Failed to load {model_name}: {e}")
-        return None, None
-
-def preload_models(model_choices):
-    """Preload all models to CPU at startup"""
-    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Preloading models to CPU...")
-
-    with MODEL_LOCK:
-        for model_name in model_choices:
-            model, tokenizer = load_single_model(model_name)
-            if model is not None and tokenizer is not None:
-                LOADED_MODELS[model_name] = model
-                LOADED_TOKENIZERS[model_name] = tokenizer
-
-def reload_models_task(model_choices):
-    """Background task to reload models every 15 minutes"""
-    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Starting model reload task...")
-
-    while not SHUTDOWN_EVENT.is_set():
-        # Wait for 15 minutes (900 seconds)
-        if SHUTDOWN_EVENT.wait(240):
-            # If event is set, exit the loop
-            break
-
-        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Starting periodic model reload...")
-
-        # Create temporary dictionaries for new models
-        new_models = {}
-        new_tokenizers = {}
-
-        # Load new models
-        for model_name in model_choices:
-            model, tokenizer = load_single_model(model_name)
-            if model is not None and tokenizer is not None:
-                new_models[model_name] = model
-                new_tokenizers[model_name] = tokenizer
-
-        # Replace old models with new ones atomically
-        with MODEL_LOCK:
-            # Store old models for cleanup
-            old_models = LOADED_MODELS.copy()
-            old_tokenizers = LOADED_TOKENIZERS.copy()
-
-            # Clear the dictionaries
-            LOADED_MODELS.clear()
-            LOADED_TOKENIZERS.clear()
-
-            # Update with new models
-            LOADED_MODELS.update(new_models)
-            LOADED_TOKENIZERS.update(new_tokenizers)
-
-        # Delete old model references
-        del old_models
-        del old_tokenizers
-
-        # Clear memory
-        clear_memory()
-
-        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Model reload completed")
 
 @spaces.GPU()
-def get_model_pipeline(model_name):
-    """Move selected model to GPU and create pipeline"""
-    with MODEL_LOCK:
-        if model_name not in LOADED_MODELS:
-            raise ValueError(f"Model {model_name} not found in preloaded models")
-
-        # Get model and tokenizer references
-        model = LOADED_MODELS[model_name]
-        tokenizer = LOADED_TOKENIZERS[model_name]
-
-        # Create pipeline with the GPU model
-        pipe = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer,
-            torch_dtype=torch.bfloat16,
-            device="cuda"
-        )
-
-        return pipe, model
-
+def load_model(model_name):
+    return pipeline("text-generation", model=model_name, device_map="cuda", torch_dtype=torch.bfloat16, trust_remote_code=True, token=os.environ["token"], use_fast=True)
 @spaces.GPU(duration=45)
 def generate(
     message,
@@ -141,32 +21,27 @@ def generate(
     max_new_tokens=256,
 ):
     try:
-        # Get the pipeline with model on GPU
-        pipe, gpu_model = get_model_pipeline(model_name)
-
-        # Build the prompt
+        pipe = load_model(model_name)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=os.environ["token"])
+        tokenizer.eos_token = "<|im_end|>"
+        print(tokenizer)
+        pipe.tokenizer = tokenizer
         prompt = f"<|im_start|>system\n{system}<|im_end|>\n"
         for (user_turn, assistant_turn) in history:
             prompt += f"<|im_start|>user\n{user_turn}<|im_end|>\n<|im_start|>assistant\n{assistant_turn}<|im_end|>\n"
         prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
 
-        streamer = TextIteratorStreamer(
-            pipe.tokenizer,
-            timeout=240.0,
-            skip_prompt=True,
-            skip_special_tokens=True
-        )
-
+        streamer = TextIteratorStreamer(pipe.tokenizer, timeout=240.0, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = dict(
-            text_inputs=prompt,
-            streamer=streamer,
-            max_new_tokens=max_new_tokens,
-            do_sample=True,
-            top_p=top_p,
-            min_p=min_p,
-            top_k=top_k,
-            temperature=temperature,
-            num_beams=1,
+            text_inputs=prompt,
+            streamer=streamer,
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            top_p=top_p,
+            min_p=min_p,
+            top_k=top_k,
+            temperature=temperature,
+            num_beams=1,
             repetition_penalty=1.1
         )
 
@@ -177,65 +52,28 @@ def generate(
         for chunk in streamer:
            outputs.append(chunk)
            yield "".join(outputs)
-
-        # Move model back to CPU after inference to free GPU memory
-        gpu_model.to("cpu")
-        torch.cuda.empty_cache()
-
    except StopAsyncIteration:
        print("Stream stopped unexpectedly.")
        yield "".join(outputs)
    except Exception as e:
        print(f"An error occurred: {e}")
-        yield f"An error occurred during generation: {str(e)}"
-    finally:
-        # Ensure model is moved back to CPU even if there's an error
-        if 'gpu_model' in locals():
-            gpu_model.to("cpu")
-            torch.cuda.empty_cache()
-
-# Define model choices
-model_choices = [
-    "Locutusque/Hercules-6.1-Llama-3.1-8B",
-    "Locutusque/liberalis-cogitator-mistral-nemo-2407",
-    "Locutusque/lmsys-llama-3.2-3b"
-]
-
-# Preload all models to CPU at startup
-preload_models(model_choices)
+        yield "An error occurred during generation."
 
-# Start the background reload task
-reload_thread = Thread(target=reload_models_task, args=(model_choices,), daemon=True)
-reload_thread.start()
-
-# Create Gradio interface
+model_choices = ["Locutusque/Hercules-6.1-Llama-3.1-8B", "Locutusque/liberalis-cogitator-mistral-nemo-2407", "Locutusque/Hercules-6.9-Llama-3.1-8B", "Locutusque/lmsys-llama-3.2-3b", "Locutusque/CollectiveLM-Falcon-3-7B", "Locutusque/StockQwen-2.5-7B"]
+# What at the best options?
 g = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
-        gr.components.Dropdown(
-            choices=model_choices,
-            label="Model",
-            value=model_choices[0],
-            interactive=True
-        ),
-        gr.components.Textbox(
-            lines=2,
-            label="System Prompt",
-            value="You are a sentient, super intelligent AI developed by a man named Locutusque."
-        ),
+        gr.components.Dropdown(choices=model_choices, label="Model", value=model_choices[0], interactive=True),
+        gr.components.Textbox(lines=2, label="System Prompt", value="You are a sentient, super intelligent AI developed by a man named Locutusque."),
         gr.components.Slider(minimum=0, maximum=2, value=0.8, label="Temperature"),
         gr.components.Slider(minimum=0, maximum=1, value=0.95, label="Top p"),
         gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Min P"),
         gr.components.Slider(minimum=0, maximum=100, step=1, value=15, label="Top k"),
-        gr.components.Slider(minimum=1, maximum=8192, step=1, value=1024, label="Max tokens"),
+        gr.components.Slider(minimum=1, maximum=8192, step=1, value=1024, label="Max tokens"),
    ],
    title="Locutusque's Language Models",
    description="Try out Locutusque's language models here! Credit goes to Mediocreatmybest for this space. You may also find some experimental preview models that have not been made public here.",
 )
-
 if __name__ == "__main__":
-    try:
-        g.launch()
-    finally:
-        # Signal the reload thread to stop when the app shuts down
-        SHUTDOWN_EVENT.set()
 
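For reference, the pattern the rewritten generate() relies on is a transformers text-generation pipeline driven through a TextIteratorStreamer from a background thread. The lines that actually launch generation (new lines 48-51) fall between the hunks above, so the Thread(target=pipe, ...) call in this sketch is an assumption based on the usual pattern, and the model name is a placeholder:

from threading import Thread
from transformers import TextIteratorStreamer, pipeline

# Placeholder model for illustration; the Space loads Locutusque models instead.
pipe = pipeline("text-generation", model="gpt2")

prompt = "<|im_start|>user\nHello!<|im_end|>\n<|im_start|>assistant\n"
streamer = TextIteratorStreamer(pipe.tokenizer, skip_prompt=True, skip_special_tokens=True)

# Run generation in a background thread; the streamer then yields decoded
# text chunks as they arrive (this launch step is elided in the diff above).
thread = Thread(target=pipe, kwargs=dict(text_inputs=prompt, streamer=streamer, max_new_tokens=64))
thread.start()

outputs = []
for chunk in streamer:
    outputs.append(chunk)
    print("".join(outputs))  # the Gradio handler yields this partial string instead
thread.join()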
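A note on the spaces.GPU decorators this commit keeps: they come from Hugging Face's ZeroGPU support, which attaches a GPU only while a decorated function runs, with duration bounding the allocation. A minimal sketch of the pattern, assuming a ZeroGPU Space (the model name is again a placeholder):

import spaces
import torch
from transformers import pipeline

@spaces.GPU(duration=45)  # request up to 45 seconds of GPU time per call
def answer(prompt):
    # Like load_model in the diff, the pipeline is built inside the decorated
    # call, so the model lands on the GPU that ZeroGPU just attached.
    pipe = pipeline("text-generation", model="gpt2", torch_dtype=torch.bfloat16, device_map="cuda")
    return pipe(prompt, max_new_tokens=32)[0]["generated_text"]

Rebuilding the pipeline on every call keeps all GPU work inside the 45-second window, at the cost of reloading weights per request, which is the trade-off this commit makes against the preload-and-reload machinery it deletes.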