dharmendra committed
Commit 48d0a68 · 1 Parent(s): 20960a5

Implement streaming responses for LLM API

Files changed (1)
  app.py  (+8, -2)
app.py CHANGED
@@ -69,6 +69,7 @@ async def generate_text(request: QuestionRequest):
             response_stream = conversation.stream({"input": request.question})
 
             for chunk in response_stream:
+                token_content = ""
                 # Each chunk is typically a dictionary with a 'content' key
                 # We want to send just the new token/text back.
                 # Ensure the chunk is stringified and followed by a newline for client parsing.
@@ -76,12 +77,17 @@ async def generate_text(request: QuestionRequest):
                 # yield f"data: {json.dumps({'token': chunk.content})}\n\n"
                 # For simplicity, we'll just yield the content directly for now.
                 if 'response' in chunk:
-                    yield chunk['response']
+                    token_content = chunk['response']
                 else:
-                    yield str(chunk)
+                    token_content = str(chunk)
+
+                yield json.dumps({"content": token_content}) + "\n"
 
                 await asyncio.sleep(0.01)  # Small delay to allow client to process chunks
+            # Optionally send a final end-of-stream message
+            yield json.dumps({"status": "completed"}) + "\n"
 
+
         except Exception as e:
             print("Error during streaming generation:")
             traceback.print_exc()
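
For context, a generator like the one changed above is normally wrapped in a FastAPI StreamingResponse so each yielded line is flushed to the client as it is produced. The commit only shows the generator body, so the sketch below fills in the surrounding route as an assumption: the /generate path, the event_stream helper, the _FakeChain stand-in for the real conversation chain, and the application/x-ndjson media type are illustrative, not taken from this repo; only QuestionRequest, request.question, and conversation.stream() appear in the diff.

# Minimal sketch of the assumed endpoint around the diffed generator.
# The /generate path, _FakeChain, and the media type are assumptions.
import asyncio
import json
import traceback

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

app = FastAPI()


class QuestionRequest(BaseModel):
    question: str


class _FakeChain:
    """Stand-in for the real LangChain-style conversation chain in app.py."""

    def stream(self, inputs):
        # Yield dicts with a 'response' key, mimicking what the diff expects.
        for word in f"You asked: {inputs['input']}".split():
            yield {"response": word + " "}


conversation = _FakeChain()


@app.post("/generate")
async def generate_text(request: QuestionRequest):
    async def event_stream():
        try:
            response_stream = conversation.stream({"input": request.question})
            for chunk in response_stream:
                token_content = chunk["response"] if "response" in chunk else str(chunk)
                # One JSON object per line (NDJSON) so the client can split on newlines.
                yield json.dumps({"content": token_content}) + "\n"
                await asyncio.sleep(0.01)  # small delay so the client can drain
            yield json.dumps({"status": "completed"}) + "\n"
        except Exception:
            print("Error during streaming generation:")
            traceback.print_exc()

    return StreamingResponse(event_stream(), media_type="application/x-ndjson")

A client can then read the response line by line and decode each line as JSON. A rough consumption example using requests; the localhost URL and the question are placeholders:

# Hypothetical client for the NDJSON stream produced above.
import json
import requests

with requests.post(
    "http://localhost:8000/generate",
    json={"question": "What does this app do?"},
    stream=True,
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line:
            continue  # skip blank lines between chunks
        event = json.loads(line)
        if event.get("status") == "completed":
            break
        print(event.get("content", ""), end="", flush=True)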