Spaces:

Metal3d
/

reasoning-deepseek-qwen2

Running on Zero

App Files Files Community

Metal3d committed on Mar 20

Commit

ca9eb6e

unverified ·

1 Parent(s): 181e1d1

Another try with asyncio

Browse files

Files changed (1) hide show

main.py +49 -32

main.py CHANGED Viewed

@@ -1,9 +1,10 @@
 import re
-import threading
 import gradio as gr
 import spaces
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 JS = """
 () => {
@@ -57,29 +58,29 @@ def reformat_math(text):
 @spaces.GPU
-def generate(messages):
     text = tokenizer.apply_chat_template(
-        messages,
         tokenize=False,
         add_generation_prompt=True,
     )
     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
-    threading.Thread(
-        target=model.generate,
-        kwargs={
-            "max_new_tokens": 1024 * 128,
-            "streamer": streamer,
             **model_inputs,
-        },
-    ).start()
-    return streamer
-def chat(prompt, history):
     """Respond to a chat prompt."""
     message = {
         "role": "user",
@@ -90,29 +91,44 @@ def chat(prompt, history):
     history = [] if history is None else history
     message_list = history + [message]
-    # get the task and the streamer
-    streamer = generate(message_list)
     buffer = ""
     reasoning = ""
     thinking = False
-    for new_text in streamer:
-        if not thinking and "<think>" in new_text:
-            thinking = True
-            continue
-        if thinking and "</think>" in new_text:
-            thinking = False
-            continue
-        if thinking:
-            reasoning += new_text
-            heading = "# Reasoning\n\n"
-            yield "I'm thinking, please wait a moment...", heading + reasoning
-            continue
-        buffer += new_text
-        yield reformat_math(buffer), reasoning
 chat_bot = gr.Chatbot(
@@ -124,6 +140,7 @@ chat_bot = gr.Chatbot(
     type="messages",
 )
 with gr.Blocks(js=JS) as demo:
     reasoning = gr.Markdown(
         "# Reasoning\n\nWhen the model will reasoning, its thoughts will be displayed here.",

+import asyncio
+import functools
 import re
 import gradio as gr
 import spaces
+from transformers import AsyncTextIteratorStreamer, AutoModelForCausalLM, AutoTokenizer
 JS = """
 () => {
 @spaces.GPU
+def _generate(history):
     text = tokenizer.apply_chat_template(
+        history,
         tokenize=False,
         add_generation_prompt=True,
     )
     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+    streamer = AsyncTextIteratorStreamer(tokenizer, skip_special_tokens=True)
+    task = asyncio.get_running_loop().run_in_executor(
+        None,
+        functools.partial(
+            model.generate,
+            max_new_tokens=1024 * 128,
+            streamer=streamer,
             **model_inputs,
+        ),
+    )
+    return task, streamer
+async def chat(prompt, history):
     """Respond to a chat prompt."""
     message = {
         "role": "user",
     history = [] if history is None else history
     message_list = history + [message]
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    task, streamer = _generate(message_list)
     buffer = ""
     reasoning = ""
     thinking = False
+    try:
+        async for new_text in streamer:
+            if task.done() or task.cancelled():
+                print("Cancelled")
+                break  # Stop le streaming si la tâche est annulée
+            if not thinking and "<think>" in new_text:
+                thinking = True
+                continue
+            if thinking and "</think>" in new_text:
+                thinking = False
+                continue
+            if thinking:
+                reasoning += new_text
+                heading = "# Reasoning\n\n"
+                yield "I'm thinking, please wait a moment...", heading + reasoning
+                continue
+            buffer += new_text
+            yield reformat_math(buffer), reasoning
+    except asyncio.CancelledError:
+        # this doesn't work, I don't find a way to stop generation thread
+        print("Cancelled")
+        streamer.on_finalized_text("cancelled", True)
+        print("Signal sent")
+        raise
+    loop.close()
 chat_bot = gr.Chatbot(
     type="messages",
 )
 with gr.Blocks(js=JS) as demo:
     reasoning = gr.Markdown(
         "# Reasoning\n\nWhen the model will reasoning, its thoughts will be displayed here.",