Spaces:

rathore11
/

PY_LLM_NEW

Paused

App Files Files Community

dharmendra committed on Jul 20

Commit

dca8b66

1 Parent(s): 81d2ef5

quantisation added

Browse files

Files changed (2) hide show

app.py +74 -38
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import os
 from fastapi import FastAPI, HTTPException
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import torch
 from pydantic import BaseModel
 import traceback
@@ -28,44 +30,37 @@ try:
     print("Successfully logged into Hugging Face Hub.")
 except Exception as e:
     print(f"Failed to log into Hugging Face Hub: {e}")
-    # The app will likely fail to load the model if login fails, so this print is for debugging.
-# --- Use Mistral 7B Instruct v0.3 model ---
 model_id = "mistralai/Mistral-7B-Instruct-v0.3"
 tokenizer = AutoTokenizer.from_pretrained(model_id, token=HUGGINGFACEHUB_API_TOKEN)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
-    device_map="auto", # 'auto' handles device placement, including offloading
-    torch_dtype=torch.bfloat16,
     trust_remote_code=True,
     token=HUGGINGFACEHUB_API_TOKEN
 )
-# --- REMOVED: model.to(device) ---
-# When device_map="auto" is used, accelerate handles device placement.
-# Manually moving the model can cause conflicts and RuntimeErrors.
-# if torch.backends.mps.is_available():
-#     device = "mps"
-# elif torch.cuda.is_available():
-#     device = "cuda"
-# else:
-#     device = "cpu"
-# model.to(device) # This line is removed
-# k=5 means it will keep the last 5 human-AI interaction pairs (10 messages total)
-memory = ConversationBufferWindowMemory(k=5)
-# Initialize Langchain HuggingFacePipeline
-llm = HuggingFacePipeline(pipeline=pipeline(
-    "text-generation",
-    model=model,
-    tokenizer=tokenizer,
-    max_new_tokens=512,
-    return_full_text=True,
-    temperature=0.2,
-    do_sample=True,
-))
 # --- UPDATED PROMPT TEMPLATE ---
 template = """<|im_start|>system
@@ -83,21 +78,61 @@ If you do not know the answer to a question, you truthfully state that it does n
 PROMPT = PromptTemplate(input_variables=["history", "input"], template=template)
-# Initialize Langchain ConversationChain
-conversation = ConversationChain(llm=llm, memory=memory, prompt=PROMPT, verbose=True)
 class QuestionRequest(BaseModel):
     question: str
 class ChatResponse(BaseModel):
     response: str
 @app.post("/api/generate")
 async def generate_text(request: QuestionRequest):
     async def generate_stream():
         started_streaming_ai_response = False
         try:
             response_stream = conversation.stream({"input": request.question})
             stop_sequences_to_check = ["Human:", "AI:", "\nHuman:", "\nAI:", "<|im_end|>"]
@@ -123,22 +158,23 @@ async def generate_text(request: QuestionRequest):
                     if stop_seq in token_content:
                         token_content = token_content.split(stop_seq, 1)[0]
                         if token_content:
-                            yield json.dumps({"content": token_content}) + "\n"
                             await asyncio.sleep(0.01)
-                        yield json.dumps({"status": "completed"}) + "\n"
                         return
                 if token_content:
-                    yield json.dumps({"content": token_content}) + "\n"
                     await asyncio.sleep(0.01)
-            yield json.dumps({"status": "completed"}) + "\n"
         except Exception as e:
-            print("Error during streaming generation:")
             traceback.print_exc()
-            yield json.dumps({"error": str(e)}) + "\n"
     return StreamingResponse(generate_stream(), media_type="application/json")
 if __name__ == "__main__":

 import os
+import uuid
+from typing import Dict, Optional
 from fastapi import FastAPI, HTTPException
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig # Import BitsAndBytesConfig
 import torch
 from pydantic import BaseModel
 import traceback
     print("Successfully logged into Hugging Face Hub.")
 except Exception as e:
     print(f"Failed to log into Hugging Face Hub: {e}")
+# --- Initialize tokenizer and model globally (heavy to load, shared across sessions) ---
 model_id = "mistralai/Mistral-7B-Instruct-v0.3"
+# --- NEW: Quantization configuration for 4-bit loading, optimized for T4 ---
+# This configuration tells Hugging Face Transformers to load the model weights
+# in 4-bit precision using the bitsandbytes library.
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True, # Enable 4-bit quantization
+    bnb_4bit_quant_type="nf4", # Quantization data type: "nf4" (4-bit NormalFloat) is designed for normally distributed weights, such as those of pretrained transformer models
+    # --- IMPORTANT CHANGE: Use float16 for compute dtype for T4 compatibility ---
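+    # T4 GPUs (Turing architecture) do not have native bfloat16 support,
+    # so float16 is used as the compute dtype for the de-quantized matrix multiplications.
+    # (The reduced memory footprint that helps avoid CPU offloading comes from the 4-bit weights themselves.)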
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True, # Nested quantization: the quantization constants are themselves quantized, saving additional memory
+)
 tokenizer = AutoTokenizer.from_pretrained(model_id, token=HUGGINGFACEHUB_API_TOKEN)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
+    device_map="auto", # 'auto' handles device placement, including offloading to CPU if necessary (but quantization aims to prevent this)
+    quantization_config=bnb_config, # Pass the quantization configuration here
+    # torch_dtype=torch.bfloat16, # REMOVED: This is now handled by bnb_4bit_compute_dtype
     trust_remote_code=True,
     token=HUGGINGFACEHUB_API_TOKEN
 )
+# Global dictionary to store active conversation chains, keyed by session_id.
+# IMPORTANT: In a production environment, this in-memory dictionary will reset
+# if the server restarts. For true persistence, you would use a database (e.g., Redis, Firestore).
+active_conversations: Dict[str, ConversationChain] = {}
 # --- UPDATED PROMPT TEMPLATE ---
 template = """<|im_start|>system
 PROMPT = PromptTemplate(input_variables=["history", "input"], template=template)
 class QuestionRequest(BaseModel):
     question: str
+    session_id: Optional[str] = None # Optional session ID for continuing conversations
 class ChatResponse(BaseModel):
     response: str
+    session_id: str # Include session_id in the response for client to track
 @app.post("/api/generate")
 async def generate_text(request: QuestionRequest):
+    """
+    Handles text generation requests, maintaining conversation history per session.
+    """
+    session_id = request.session_id
+    # If no session_id is provided, generate a new one.
+    # This signifies the start of a new conversation.
+    if session_id is None:
+        session_id = str(uuid.uuid4())
+        print(f"Starting new conversation with session_id: {session_id}")
+    # Retrieve or create a ConversationChain for this session_id
+    if session_id not in active_conversations:
+        print(f"Creating new ConversationChain for session_id: {session_id}")
+        # Initialize Langchain HuggingFacePipeline for this session
+        llm = HuggingFacePipeline(pipeline=pipeline(
+            "text-generation",
+            model=model, # Use the globally loaded model
+            tokenizer=tokenizer, # Use the globally loaded tokenizer
+            max_new_tokens=512,
+            return_full_text=True,
+            temperature=0.2,
+            do_sample=True,
+        ))
+        # Initialize memory for this specific session
+        memory = ConversationBufferWindowMemory(k=5) # Remembers the last 5 human-AI interaction pairs
+        conversation = ConversationChain(llm=llm, memory=memory, prompt=PROMPT, verbose=True)
+        active_conversations[session_id] = conversation
+    else:
+        print(f"Continuing conversation for session_id: {session_id}")
+        conversation = active_conversations[session_id]
     async def generate_stream():
+        """
+        An asynchronous generator function to stream text responses token-by-token.
+        Each yielded item will be a JSON string representing a part of the stream.
+        """
+        # Flag to indicate when we've started streaming the AI's actual response
         started_streaming_ai_response = False
         try:
+            # First, send a JSON object containing the session_id.
+            # This allows the client to immediately get the session ID.
+            yield json.dumps({"type": "session_info", "session_id": session_id}) + "\n"
             response_stream = conversation.stream({"input": request.question})
             stop_sequences_to_check = ["Human:", "AI:", "\nHuman:", "\nAI:", "<|im_end|>"]
                     if stop_seq in token_content:
                         token_content = token_content.split(stop_seq, 1)[0]
                         if token_content:
+                            yield json.dumps({"type": "token", "content": token_content}) + "\n"
                             await asyncio.sleep(0.01)
+                        yield json.dumps({"type": "end", "status": "completed", "session_id": session_id}) + "\n"
                         return
                 if token_content:
+                    yield json.dumps({"type": "token", "content": token_content}) + "\n"
                     await asyncio.sleep(0.01)
+            yield json.dumps({"type": "end", "status": "completed", "session_id": session_id}) + "\n"
         except Exception as e:
+            print(f"Error during streaming generation for session {session_id}:")
             traceback.print_exc()
+            yield json.dumps({"type": "error", "message": str(e), "session_id": session_id}) + "\n"
+    # Return a StreamingResponse of newline-delimited JSON objects (one per line), sent with the application/json media type
     return StreamingResponse(generate_stream(), media_type="application/json")
 if __name__ == "__main__":

requirements.txt CHANGED Viewed

@@ -69,3 +69,4 @@ uvicorn==0.34.0
 yarl==1.19.0
 zstandard==0.23.0
 protobuf

 yarl==1.19.0
 zstandard==0.23.0
 protobuf
+bitsandbytes==0.43.0