Try to fix
app.py CHANGED

@@ -12,6 +12,7 @@ import threading
 from transformers import TextIteratorStreamer
 import queue
 
+@spaces.GPU
 class RichTextStreamer(TextIteratorStreamer):
     def __init__(self, tokenizer, **kwargs):
         super().__init__(tokenizer, **kwargs)
@@ -74,7 +75,7 @@ def chat_with_model(messages):
         pad_token_id=pad_id
     )
 
-
+    thread = threading.Thread(target=current_model.generate, kwargs=generation_kwargs)
     thread.start()
 
     output_text = ""
@@ -87,11 +88,10 @@ def chat_with_model(messages):
         token_id = token_info["token_id"]
         is_special = token_info["is_special"]
 
-        # Skip appending the EOS token to output
         if token_id == current_tokenizer.eos_token_id:
+            streamer.end_of_generation.set()  # signal to stop generation thread
             break
 
-        # Detect reasoning block
         if "<think>" in token_str:
             in_think = True
             token_str = token_str.replace("<think>", "")
@@ -112,6 +112,9 @@ def chat_with_model(messages):
         messages[-1]["content"] = output_text
         yield messages
 
+    # Ensure generation thread stops
+    thread.join(timeout=1.0)
+
     current_model.to("cpu")
     torch.cuda.empty_cache()
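For context, a minimal sketch of the producer/consumer pattern this commit relies on. The diff shows only RichTextStreamer's __init__ and the later use of an end_of_generation event, so the GenerationStopped exception, the put() override, and the run_generation wrapper below are assumptions about how the event might actually stop the producer thread, not the app's real code (the real streamer also yields per-token dicts with token_id and is_special, which this sketch omits):

import threading
from transformers import TextIteratorStreamer

class GenerationStopped(Exception):
    """Raised inside generate() to unwind the producer thread early."""

class RichTextStreamer(TextIteratorStreamer):
    # Sketch only: the diff shows just __init__ and the event's use,
    # so put() below is an assumption about how the event is consumed.
    def __init__(self, tokenizer, **kwargs):
        super().__init__(tokenizer, **kwargs)
        self.end_of_generation = threading.Event()

    def put(self, value):
        # generate() calls put() once per decoding step; raising here
        # aborts generation as soon as the consumer sets the event.
        if self.end_of_generation.is_set():
            raise GenerationStopped
        super().put(value)

def run_generation(model, generation_kwargs):
    # Wrapper target so an intentional abort does not dump a traceback.
    try:
        model.generate(**generation_kwargs)
    except GenerationStopped:
        pass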
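Hypothetical usage mirroring the commit's start/signal/join sequence; inputs, the "</answer>" stop marker, and max_new_tokens are placeholders:

streamer = RichTextStreamer(current_tokenizer, skip_prompt=True)
generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=512)
thread = threading.Thread(target=run_generation, args=(current_model, generation_kwargs))
thread.start()

output_text = ""
for text in streamer:          # stock iteration yields decoded text chunks
    output_text += text
    if "</answer>" in output_text:   # hypothetical early-stop condition
        break

streamer.end_of_generation.set()     # mirrors the commit's EOS handling
thread.join(timeout=1.0)             # bounded wait, as in the commit

Note the division of labor: setting the Event only carries the signal; it is the raise inside put() that actually stops generate(), and join(timeout=1.0) keeps the consumer from hanging if the producer thread is slow to unwind.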