Metal3d committed
Commit 181e1d1 · unverified · 1 Parent(s): 48a12f0

Remove asyncio :(

Files changed (1):
  1. main.py +33 -46
main.py CHANGED
@@ -1,10 +1,9 @@
-import asyncio
-import functools
 import re
+import threading
 
 import gradio as gr
 import spaces
-from transformers import AsyncTextIteratorStreamer, AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
 JS = """
 () => {
@@ -58,29 +57,29 @@ def reformat_math(text):
 
 
 @spaces.GPU
-def _generate(history):
+def generate(messages):
     text = tokenizer.apply_chat_template(
-        history,
+        messages,
         tokenize=False,
         add_generation_prompt=True,
     )
 
     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-    streamer = AsyncTextIteratorStreamer(tokenizer, skip_special_tokens=True)
-
-    task = asyncio.get_running_loop().run_in_executor(
-        None,
-        functools.partial(
-            model.generate,
-            max_new_tokens=1024 * 128,
-            streamer=streamer,
+    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+
+    threading.Thread(
+        target=model.generate,
+        kwargs={
+            "max_new_tokens": 1024 * 128,
+            "streamer": streamer,
             **model_inputs,
-        ),
-    )
-    return task, streamer
+        },
+    ).start()
+
+    return streamer
 
 
-async def chat(prompt, history):
+def chat(prompt, history):
     """Respond to a chat prompt."""
     message = {
         "role": "user",
@@ -92,40 +91,28 @@ async def chat(prompt, history):
     message_list = history + [message]
 
     # get the task and the streamer
-    task, streamer = _generate(message_list)
+    streamer = generate(message_list)
 
     buffer = ""
     reasoning = ""
     thinking = False
 
-    try:
-        async for new_text in streamer:
-            if task.done() or task.cancelled():
-                print("Cancelled")
-                break  # stop streaming if the task was cancelled
-
-            if not thinking and "<think>" in new_text:
-                thinking = True
-                continue
-            if thinking and "</think>" in new_text:
-                thinking = False
-                continue
-
-            if thinking:
-                reasoning += new_text
-                heading = "# Reasoning\n\n"
-                yield "I'm thinking, please wait a moment...", heading + reasoning
-                continue
-
-            buffer += new_text
-            yield reformat_math(buffer), reasoning
-
-    except asyncio.CancelledError:
-        # this doesn't work, I can't find a way to stop the generation thread
-        print("Cancelled")
-        streamer.on_finalized_text("cancelled", True)
-        print("Signal sent")
-        raise
+    for new_text in streamer:
+        if not thinking and "<think>" in new_text:
+            thinking = True
+            continue
+        if thinking and "</think>" in new_text:
+            thinking = False
+            continue
+
+        if thinking:
+            reasoning += new_text
+            heading = "# Reasoning\n\n"
+            yield "I'm thinking, please wait a moment...", heading + reasoning
+            continue
+
+        buffer += new_text
+        yield reformat_math(buffer), reasoning
 
 
 chat_bot = gr.Chatbot(
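
For reference, the replacement follows the standard transformers streaming recipe: model.generate() runs in a background thread while the caller iterates the TextIteratorStreamer, which blocks on an internal queue until new decoded text arrives. Below is a minimal, self-contained sketch of that pattern; the model name and prompt are placeholders, not values taken from this Space.

import threading

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Placeholder checkpoint; the Space configures its real model elsewhere in main.py.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

inputs = tokenizer(["Hello, who are you?"], return_tensors="pt").to(model.device)
# skip_prompt=True keeps the echoed prompt out of the streamed output.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until generation finishes, so it runs in a worker thread
# while the main thread consumes decoded chunks as they are produced.
thread = threading.Thread(
    target=model.generate,
    kwargs={"streamer": streamer, "max_new_tokens": 64, **inputs},
)
thread.start()

for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()

Because iterating the synchronous streamer simply blocks the consuming generator, chat() no longer needs an event loop, which is what lets this commit drop asyncio, functools, and AsyncTextIteratorStreamer in one move.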
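
The removed comment ("this doesn't work, I can't find a way to stop the generation thread") points at a gap that survives this commit: neither version can interrupt a running generate() call. One commonly used workaround, sketched here as a suggestion rather than anything the commit implements, is a custom StoppingCriteria that polls a threading.Event.

import threading

from transformers import StoppingCriteria, StoppingCriteriaList

class CancelOnEvent(StoppingCriteria):
    """Stop generation as soon as the given event is set."""

    def __init__(self, event: threading.Event):
        self.event = event

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # Returning True tells generate() to stop after the current decoding step.
        return self.event.is_set()

cancel = threading.Event()
# Passed alongside the streamer:
#   model.generate(..., stopping_criteria=StoppingCriteriaList([CancelOnEvent(cancel)]))
# Any thread, e.g. a Gradio stop-button handler, can then call cancel.set().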