dharmendra committed
Commit 89183a0 · 1 Parent(s): 51e51e6

Implement streaming responses for LLM API

Files changed (1): app.py (+18 -31)
app.py CHANGED
@@ -11,6 +11,7 @@ from starlette.responses import StreamingResponse # <-- NEW IMPORT
import asyncio
from langchain_community.llms import HuggingFacePipeline
import json
+ from langchain.prompts import PromptTemplate

app = FastAPI()
# Get the Hugging Face API token from environment variables (BEST PRACTICE)
@@ -47,13 +48,27 @@ llm = HuggingFacePipeline(pipeline=pipeline(
tokenizer=tokenizer,
max_new_tokens=512, # Adjust as needed for desired response length
return_full_text=False, # Crucial for getting only the AI's response, esp when ans is small
- temperature=0.7, # Controls randomness (0.0 for deterministic, 1.0 for very creative)
+ temperature=0.5, # Controls randomness (0.0 for deterministic, 1.0 for very creative)
do_sample=True # Enable sampling for more varied outputs
))

+ template = """The following is a concise and direct conversation between a human and an AI.
+ The AI should provide a direct answer to the human's question and strictly avoid asking any follow-up questions.
+ The AI should not generate any additional conversational turns (e.g., "Human: ...").
+ If the AI is asked for its name, it should respond with "I am Siddhi."
+ If the AI does not know the answer to a question, it should truthfully state that it does not know.
+
+ Current conversation:
+ {history}
+ Human: {input}
+ AI:"""
+
+ PROMPT = PromptTemplate(input_variables=["history", "input"], template=template)
+
+
# Initialize Langchain ConversationChain
# verbose=True for debugging LangChain's pro
- conversation = ConversationChain(llm=llm, memory=memory,verbose=True)
+ conversation = ConversationChain(llm=llm, memory=memory, prompt = PROMPT, verbose=True)

class QuestionRequest(BaseModel):
question: str
@@ -102,32 +117,4 @@ async def generate_text(request: QuestionRequest):



- # below when not using langchain fully
- # try:
- # # Retrieve history
- # history = memory.load_memory_variables({})['history']
- # # Create prompt with history and current question
- # prompt = f"History:\n{history}\nQuestion: {request.question}\nAnswer:"
-
- # inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
-
- # with torch.no_grad():
- # outputs = model.generate(
- # inputs=inputs['input_ids'], # Pass the 'input_ids' tensor
- # attention_mask=inputs['attention_mask'],
- # max_length=300,
- # num_beams=5,
- # no_repeat_ngram_size=2,
- # temperature=0.7,
- # top_k=50,
- # top_p=0.95,
- # do_sample=True,
- # eos_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>"),
- # pad_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>")
- # )
- # response = tokenizer.decode(outputs[0], skip_special_tokens=True)
- # return {"response": response}
- # except Exception as e:
- # print("Error during generation:")
- # traceback.print_exc()
- # raise HTTPException(status_code=500, detail=str(e))
+
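
The hunks above change the prompt template and lower the sampling temperature; the streaming endpoint referenced in the commit title is not part of the visible diff. Below is a minimal sketch of how such an endpoint could look in this app, assuming it reuses the `conversation` chain, `app`, and `QuestionRequest` objects already defined in app.py; the `/generate-stream` route name, the 20-character chunk size, and the chunk-after-completion approach are illustrative assumptions, not the commit's actual implementation.

```python
# Illustrative sketch only: reuses `app`, `conversation`, and `QuestionRequest`
# from app.py; the route name and chunking strategy are assumptions.
import asyncio
from starlette.responses import StreamingResponse

@app.post("/generate-stream")
async def generate_stream(request: QuestionRequest):
    async def chunk_stream():
        # ConversationChain.predict is blocking, so run it in a worker thread
        # to keep the FastAPI event loop responsive.
        text = await asyncio.to_thread(conversation.predict, input=request.question)
        # Emit the completed answer in small chunks to the client.
        for i in range(0, len(text), 20):
            yield text[i:i + 20]
            await asyncio.sleep(0)  # yield control between chunks
    return StreamingResponse(chunk_stream(), media_type="text/plain")
```

Chunking the finished answer keeps the sketch simple; true token-by-token streaming from a local HuggingFacePipeline would typically require threading a transformers TextIteratorStreamer through the generation call instead.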