Metal3d committed
Commit ca9eb6e · unverified · Parent(s): 181e1d1

Another try with asyncio

Files changed (1): main.py (+49 −32)
main.py CHANGED
@@ -1,9 +1,10 @@
+import asyncio
+import functools
 import re
-import threading
 
 import gradio as gr
 import spaces
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from transformers import AsyncTextIteratorStreamer, AutoModelForCausalLM, AutoTokenizer
 
 JS = """
 () => {
@@ -57,29 +58,29 @@ def reformat_math(text):
 
 
 @spaces.GPU
-def generate(messages):
+def _generate(history):
     text = tokenizer.apply_chat_template(
-        messages,
+        history,
         tokenize=False,
         add_generation_prompt=True,
     )
 
     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
-
-    threading.Thread(
-        target=model.generate,
-        kwargs={
-            "max_new_tokens": 1024 * 128,
-            "streamer": streamer,
+    streamer = AsyncTextIteratorStreamer(tokenizer, skip_special_tokens=True)
+
+    task = asyncio.get_running_loop().run_in_executor(
+        None,
+        functools.partial(
+            model.generate,
+            max_new_tokens=1024 * 128,
+            streamer=streamer,
             **model_inputs,
-        },
-    ).start()
-
-    return streamer
+        ),
+    )
+    return task, streamer
 
 
-def chat(prompt, history):
+async def chat(prompt, history):
     """Respond to a chat prompt."""
     message = {
         "role": "user",
@@ -90,29 +91,44 @@ def chat(prompt, history):
     history = [] if history is None else history
     message_list = history + [message]
 
-    # get the task and the streamer
-    streamer = generate(message_list)
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    task, streamer = _generate(message_list)
 
     buffer = ""
     reasoning = ""
     thinking = False
 
-    for new_text in streamer:
-        if not thinking and "<think>" in new_text:
-            thinking = True
-            continue
-        if thinking and "</think>" in new_text:
-            thinking = False
-            continue
+    try:
+        async for new_text in streamer:
+            if task.done() or task.cancelled():
+                print("Cancelled")
+                break  # stop streaming if the task is cancelled
+
+            if not thinking and "<think>" in new_text:
+                thinking = True
+                continue
+            if thinking and "</think>" in new_text:
+                thinking = False
+                continue
 
-        if thinking:
-            reasoning += new_text
-            heading = "# Reasoning\n\n"
-            yield "I'm thinking, please wait a moment...", heading + reasoning
-            continue
+            if thinking:
+                reasoning += new_text
+                heading = "# Reasoning\n\n"
+                yield "I'm thinking, please wait a moment...", heading + reasoning
+                continue
 
-        buffer += new_text
-        yield reformat_math(buffer), reasoning
+            buffer += new_text
+            yield reformat_math(buffer), reasoning
+
+    except asyncio.CancelledError:
+        # this doesn't work; I haven't found a way to stop the generation thread
+        print("Cancelled")
+        streamer.on_finalized_text("cancelled", True)
+        print("Signal sent")
+        raise
+
+    loop.close()
 
 
 chat_bot = gr.Chatbot(
@@ -124,6 +140,7 @@ chat_bot = gr.Chatbot(
     type="messages",
 )
 
+
 with gr.Blocks(js=JS) as demo:
     reasoning = gr.Markdown(
        "# Reasoning\n\nWhen the model is reasoning, its thoughts will be displayed here.",