dharmendra committed on
Commit
a05ac69
·
1 Parent(s): 58966a1

Implement streaming responses for LLM API

Browse files
Files changed (1) hide show
  1. app.py +27 -7
app.py CHANGED
@@ -60,13 +60,33 @@ class ChatResponse(BaseModel):
60
 
61
  @app.post("/api/generate")
62
  async def generate_text(request: QuestionRequest):
63
- try:
64
- response = conversation.predict(input=request.question)
65
- return {"response": response}
66
- except Exception as e:
67
- print("Error during generation:")
68
- traceback.print_exc()
69
- raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
 
72
  # below when not using langchain fully
 
60
 
61
  @app.post("/api/generate")
62
  async def generate_text(request: QuestionRequest):
63
+ async def generate_stream():
64
+ try:
65
+ # Use LangChain's .stream() method for token-by-token generation
66
+ # This will yield chunks of the response as they are produced
67
+ response_stream = conversation.stream({"input": request.question})
68
+
69
+ for chunk in response_stream:
70
+ # Each chunk is typically a dictionary with a 'content' key
71
+ # We want to send just the new token/text back.
72
+ # Ensure the chunk is stringified and followed by a newline for client parsing.
73
+ # For more robust streaming, consider Server-Sent Events (SSE) format:
74
+ # yield f"data: {json.dumps({'token': chunk.content})}\n\n"
75
+ # For simplicity, we'll just yield the content directly for now.
76
+ yield chunk.content
77
+ await asyncio.sleep(0.01) # Small delay to allow client to process chunks
78
+
79
+ except Exception as e:
80
+ print("Error during streaming generation:")
81
+ traceback.print_exc()
82
+ # You might want to yield an error message to the client here
83
+ yield f"ERROR: {str(e)}\n"
84
+
85
+ # Return a StreamingResponse, which will send chunks as they are yielded by generate_stream()
86
+ # media_type can be "text/event-stream" for SSE, or "text/plain" for simple newline-delimited text.
87
+ # For simplicity, we'll start with "text/plain" for easier initial client parsing.
88
+ return StreamingResponse(generate_stream(), media_type="text/plain")
89
+
90
 
91
 
92
  # below when not using langchain fully