dharmendra committed
Commit 48d0a68 · 1 Parent(s): 20960a5

Implement streaming responses for LLM API

Files changed (1)
  app.py  (+8, -2)
app.py CHANGED
@@ -69,6 +69,7 @@ async def generate_text(request: QuestionRequest):
             response_stream = conversation.stream({"input": request.question})
 
             for chunk in response_stream:
+                token_content = ""
                 # Each chunk is typically a dictionary with a 'content' key
                 # We want to send just the new token/text back.
                 # Ensure the chunk is stringified and followed by a newline for client parsing.
@@ -76,12 +77,17 @@ async def generate_text(request: QuestionRequest):
                 # yield f"data: {json.dumps({'token': chunk.content})}\n\n"
                 # For simplicity, we'll just yield the content directly for now.
                 if 'response' in chunk:
-                    yield chunk['response']
+                    token_content = chunk['response']
                 else:
-                    yield str(chunk)
+                    token_content = str(chunk)
+
+                yield json.dumps({"content": token_content}) + "\n"
 
                 await asyncio.sleep(0.01)  # Small delay to allow client to process chunks
+            # Optionally send a final end-of-stream message
+            yield json.dumps({"status": "completed"}) + "\n"
 
+
         except Exception as e:
             print("Error during streaming generation:")
             traceback.print_exc()
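
For context, a generator like the one changed above is normally wrapped in a FastAPI StreamingResponse so each yielded line is flushed to the client as it is produced. The commit only shows the generator body, so the sketch below fills in the surrounding route as an assumption: the /generate path, the event_stream helper, the _FakeChain stand-in for the real conversation chain, and the application/x-ndjson media type are illustrative, not taken from this repo; only QuestionRequest, request.question, and conversation.stream() appear in the diff.

# Minimal sketch of the assumed endpoint around the diffed generator.
# The /generate path, _FakeChain, and the media type are assumptions.
import asyncio
import json
import traceback

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

app = FastAPI()


class QuestionRequest(BaseModel):
    question: str


class _FakeChain:
    """Stand-in for the real LangChain-style conversation chain in app.py."""

    def stream(self, inputs):
        # Yield dicts with a 'response' key, mimicking what the diff expects.
        for word in f"You asked: {inputs['input']}".split():
            yield {"response": word + " "}


conversation = _FakeChain()


@app.post("/generate")
async def generate_text(request: QuestionRequest):
    async def event_stream():
        try:
            response_stream = conversation.stream({"input": request.question})
            for chunk in response_stream:
                token_content = chunk["response"] if "response" in chunk else str(chunk)
                # One JSON object per line (NDJSON) so the client can split on newlines.
                yield json.dumps({"content": token_content}) + "\n"
                await asyncio.sleep(0.01)  # small delay so the client can drain
            yield json.dumps({"status": "completed"}) + "\n"
        except Exception:
            print("Error during streaming generation:")
            traceback.print_exc()

    return StreamingResponse(event_stream(), media_type="application/x-ndjson")

A client can then read the response line by line and decode each line as JSON. A rough consumption example using requests; the localhost URL and the question are placeholders:

# Hypothetical client for the NDJSON stream produced above.
import json
import requests

with requests.post(
    "http://localhost:8000/generate",
    json={"question": "What does this app do?"},
    stream=True,
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line:
            continue  # skip blank lines between chunks
        event = json.loads(line)
        if event.get("status") == "completed":
            break
        print(event.get("content", ""), end="", flush=True)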