dharmendra committed
Commit 89183a0 · 1 Parent(s): 51e51e6

Implement streaming responses for LLM API

Files changed (1): app.py (+18 -31)
app.py CHANGED
@@ -11,6 +11,7 @@ from starlette.responses import StreamingResponse # <-- NEW IMPORT
import asyncio
from langchain_community.llms import HuggingFacePipeline
import json
+ from langchain.prompts import PromptTemplate

app = FastAPI()
# Get the Hugging Face API token from environment variables (BEST PRACTICE)
@@ -47,13 +48,27 @@ llm = HuggingFacePipeline(pipeline=pipeline(
tokenizer=tokenizer,
max_new_tokens=512, # Adjust as needed for desired response length
return_full_text=False, # Crucial for getting only the AI's response, esp when ans is small
- temperature=0.7, # Controls randomness (0.0 for deterministic, 1.0 for very creative)
+ temperature=0.5, # Controls randomness (0.0 for deterministic, 1.0 for very creative)
do_sample=True # Enable sampling for more varied outputs
))

+ template = """The following is a concise and direct conversation between a human and an AI.
+ The AI should provide a direct answer to the human's question and strictly avoid asking any follow-up questions.
+ The AI should not generate any additional conversational turns (e.g., "Human: ...").
+ If the AI is asked for its name, it should respond with "I am Siddhi."
+ If the AI does not know the answer to a question, it should truthfully state that it does not know.
+
+ Current conversation:
+ {history}
+ Human: {input}
+ AI:"""
+
+ PROMPT = PromptTemplate(input_variables=["history", "input"], template=template)
+
+
# Initialize Langchain ConversationChain
# verbose=True for debugging LangChain's pro
- conversation = ConversationChain(llm=llm, memory=memory,verbose=True)
+ conversation = ConversationChain(llm=llm, memory=memory, prompt = PROMPT, verbose=True)

class QuestionRequest(BaseModel):
question: str
@@ -102,32 +117,4 @@ async def generate_text(request: QuestionRequest):



- # below when not using langchain fully
- # try:
- # # Retrieve history
- # history = memory.load_memory_variables({})['history']
- # # Create prompt with history and current question
- # prompt = f"History:\n{history}\nQuestion: {request.question}\nAnswer:"
-
- # inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
-
- # with torch.no_grad():
- # outputs = model.generate(
- # inputs=inputs['input_ids'], # Pass the 'input_ids' tensor
- # attention_mask=inputs['attention_mask'],
- # max_length=300,
- # num_beams=5,
- # no_repeat_ngram_size=2,
- # temperature=0.7,
- # top_k=50,
- # top_p=0.95,
- # do_sample=True,
- # eos_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>"),
- # pad_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>")
- # )
- # response = tokenizer.decode(outputs[0], skip_special_tokens=True)
- # return {"response": response}
- # except Exception as e:
- # print("Error during generation:")
- # traceback.print_exc()
- # raise HTTPException(status_code=500, detail=str(e))
+
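
The hunks above change the prompt template and lower the sampling temperature; the streaming endpoint referenced in the commit title is not part of the visible diff. Below is a minimal sketch of how such an endpoint could look in this app, assuming it reuses the `conversation` chain, `app`, and `QuestionRequest` objects already defined in app.py; the `/generate-stream` route name, the 20-character chunk size, and the chunk-after-completion approach are illustrative assumptions, not the commit's actual implementation.

```python
# Illustrative sketch only: reuses `app`, `conversation`, and `QuestionRequest`
# from app.py; the route name and chunking strategy are assumptions.
import asyncio
from starlette.responses import StreamingResponse

@app.post("/generate-stream")
async def generate_stream(request: QuestionRequest):
    async def chunk_stream():
        # ConversationChain.predict is blocking, so run it in a worker thread
        # to keep the FastAPI event loop responsive.
        text = await asyncio.to_thread(conversation.predict, input=request.question)
        # Emit the completed answer in small chunks to the client.
        for i in range(0, len(text), 20):
            yield text[i:i + 20]
            await asyncio.sleep(0)  # yield control between chunks
    return StreamingResponse(chunk_stream(), media_type="text/plain")
```

Chunking the finished answer keeps the sketch simple; true token-by-token streaming from a local HuggingFacePipeline would typically require threading a transformers TextIteratorStreamer through the generation call instead.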