Locutusque committed
Commit 7462772 · verified · 1 Parent(s): 2b65eee

Update app.py

Files changed (1): app.py +111 -25
app.py CHANGED
@@ -1,13 +1,65 @@
 import spaces
 import gradio as gr
-from transformers import pipeline, AutoTokenizer, TextIteratorStreamer
+from transformers import pipeline, AutoTokenizer, TextIteratorStreamer, AutoModelForCausalLM
 import torch
 from threading import Thread
 import os
 
+# Global dictionaries to store preloaded models and tokenizers
+LOADED_MODELS = {}
+LOADED_TOKENIZERS = {}
+
+def preload_models(model_choices):
+    """Preload all models to CPU at startup"""
+    print("Preloading models to CPU...")
+    for model_name in model_choices:
+        try:
+            print(f"Loading {model_name}...")
+            # Load model to CPU with bfloat16 to save memory
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype=torch.bfloat16,
+                trust_remote_code=True,
+                token=os.environ.get("token"),
+                device_map="cpu",
+                low_cpu_mem_usage=True
+            )
+
+            # Load tokenizer
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_name,
+                trust_remote_code=True,
+                token=os.environ.get("token")
+            )
+            tokenizer.eos_token = "<|im_end|>"
+
+            LOADED_MODELS[model_name] = model
+            LOADED_TOKENIZERS[model_name] = tokenizer
+            print(f"Successfully loaded {model_name}")
+        except Exception as e:
+            print(f"Failed to load {model_name}: {e}")
+
 @spaces.GPU()
-def load_model(model_name):
-    return pipeline("text-generation", model=model_name, device_map="cuda", torch_dtype=torch.bfloat16, trust_remote_code=True, token=os.environ["token"], use_fast=True)
+def get_model_pipeline(model_name):
+    """Move selected model to GPU and create pipeline"""
+    if model_name not in LOADED_MODELS:
+        raise ValueError(f"Model {model_name} not found in preloaded models")
+
+    # Move model to GPU
+    model = LOADED_MODELS[model_name].to("cuda")
+    tokenizer = LOADED_TOKENIZERS[model_name]
+
+    # Create pipeline with the GPU model
+    pipe = pipeline(
+        "text-generation",
+        model=model,
+        tokenizer=tokenizer,
+        torch_dtype=torch.bfloat16,
+        device="cuda"
+    )
+
+    return pipe, model
+
 @spaces.GPU(duration=45)
 def generate(
     message,
@@ -21,27 +73,32 @@ def generate(
     max_new_tokens=256,
 ):
     try:
-        pipe = load_model(model_name)
-        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=os.environ["token"])
-        tokenizer.eos_token = "<|im_end|>"
-        print(tokenizer)
-        pipe.tokenizer = tokenizer
+        # Get the pipeline with model on GPU
+        pipe, gpu_model = get_model_pipeline(model_name)
+
+        # Build the prompt
         prompt = f"<|im_start|>system\n{system}<|im_end|>\n"
         for (user_turn, assistant_turn) in history:
             prompt += f"<|im_start|>user\n{user_turn}<|im_end|>\n<|im_start|>assistant\n{assistant_turn}<|im_end|>\n"
         prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
 
-        streamer = TextIteratorStreamer(pipe.tokenizer, timeout=240.0, skip_prompt=True, skip_special_tokens=True)
+        streamer = TextIteratorStreamer(
+            pipe.tokenizer,
+            timeout=240.0,
+            skip_prompt=True,
+            skip_special_tokens=True
+        )
+
         generation_kwargs = dict(
-            text_inputs=prompt,
-            streamer=streamer,
-            max_new_tokens=max_new_tokens,
-            do_sample=True,
-            top_p=top_p,
-            min_p=min_p,
-            top_k=top_k,
-            temperature=temperature,
-            num_beams=1,
+            text_inputs=prompt,
+            streamer=streamer,
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            top_p=top_p,
+            min_p=min_p,
+            top_k=top_k,
+            temperature=temperature,
+            num_beams=1,
             repetition_penalty=1.1
         )
 
@@ -52,28 +109,57 @@
         for chunk in streamer:
             outputs.append(chunk)
             yield "".join(outputs)
+
+        # Move model back to CPU after inference to free GPU memory
+        gpu_model.to("cpu")
+        torch.cuda.empty_cache()
+
     except StopAsyncIteration:
         print("Stream stopped unexpectedly.")
        yield "".join(outputs)
     except Exception as e:
         print(f"An error occurred: {e}")
-        yield "An error occurred during generation."
+        yield f"An error occurred during generation: {str(e)}"
+    finally:
+        # Ensure model is moved back to CPU even if there's an error
+        if 'gpu_model' in locals():
+            gpu_model.to("cpu")
+            torch.cuda.empty_cache()
+
+# Define model choices
+model_choices = [
+    "Locutusque/Hercules-6.1-Llama-3.1-8B",
+    "Locutusque/liberalis-cogitator-mistral-nemo-2407",
+    "Locutusque/lmsys-llama-3.2-3b"
+]
 
-model_choices = ["Locutusque/Hercules-6.1-Llama-3.1-8B", "Locutusque/liberalis-cogitator-mistral-nemo-2407", "Locutusque/Hercules-6.9-Llama-3.1-8B", "Locutusque/lmsys-llama-3.2-3b", "Locutusque/CollectiveLM-Falcon-3-7B", "Locutusque/StockQwen-2.5-7B"]
-# What at the best options?
+# Preload all models to CPU at startup
+preload_models(model_choices)
+
+# Create Gradio interface
 g = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
-        gr.components.Dropdown(choices=model_choices, label="Model", value=model_choices[0], interactive=True),
-        gr.components.Textbox(lines=2, label="System Prompt", value="You are a sentient, super intelligent AI developed by a man named Locutusque."),
+        gr.components.Dropdown(
+            choices=model_choices,
+            label="Model",
+            value=model_choices[0],
+            interactive=True
+        ),
+        gr.components.Textbox(
+            lines=2,
+            label="System Prompt",
+            value="You are a sentient, super intelligent AI developed by a man named Locutusque."
+        ),
         gr.components.Slider(minimum=0, maximum=2, value=0.8, label="Temperature"),
         gr.components.Slider(minimum=0, maximum=1, value=0.95, label="Top p"),
         gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Min P"),
         gr.components.Slider(minimum=0, maximum=100, step=1, value=15, label="Top k"),
-        gr.components.Slider(minimum=1, maximum=8192, step=1, value=1024, label="Max tokens"),
+        gr.components.Slider(minimum=1, maximum=8192, step=1, value=1024, label="Max tokens"),
     ],
     title="Locutusque's Language Models",
     description="Try out Locutusque's language models here! Credit goes to Mediocreatmybest for this space. You may also find some experimental preview models that have not been made public here.",
 )
+
 if __name__ == "__main__":
-    g.launch()
+    g.launch()
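
The key change in this commit is the preload-to-CPU / move-to-GPU-per-request pattern: weights sit in host RAM until a @spaces.GPU call borrows the device, then return to CPU so the ZeroGPU Space releases its allocation. A minimal self-contained sketch of the same pattern follows; the model id is a placeholder, and the "token" environment variable is assumed from the Space, not required:

import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "gpt2"  # placeholder; any causal-LM checkpoint works

# Load once at startup, on CPU, in bfloat16 to halve memory versus float32.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    token=os.environ.get("token"),  # assumed env var; None is fine for public models
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=os.environ.get("token"))

def run_on_gpu(prompt: str) -> str:
    """Borrow the GPU for one request, then hand the memory back."""
    try:
        model.to("cuda")
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        with torch.no_grad():
            out = model.generate(**inputs, max_new_tokens=64)
        return tokenizer.decode(out[0], skip_special_tokens=True)
    finally:
        # Mirror the diff's cleanup: weights back to CPU, cached allocations released.
        model.to("cpu")
        torch.cuda.empty_cache()

Putting the cleanup in a finally block, as the diff does, guarantees the weights leave the GPU even when generation raises.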
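
The streaming side of generate() is only partly visible in the hunks (the Thread launch falls in elided context), so here is a minimal sketch of the TextIteratorStreamer pattern it relies on, again with a placeholder model id: generation runs in a background thread while the caller iterates the streamer and yields cumulative text, the shape gr.ChatInterface expects.

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "gpt2"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

def stream_reply(prompt: str):
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    inputs = tokenizer(prompt, return_tensors="pt")
    # generate() runs in a worker thread and feeds decoded tokens into the streamer
    Thread(target=model.generate,
           kwargs=dict(**inputs, streamer=streamer, max_new_tokens=64)).start()
    outputs = []
    for chunk in streamer:        # blocks until the next decoded chunk arrives
        outputs.append(chunk)
        yield "".join(outputs)    # cumulative string, as the chat UI redraws the reply

Yielding the cumulative join rather than the raw chunks is what lets the chat interface repaint the growing assistant message on each step.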