Spaces:
Paused
Paused
dharmendra
commited on
Commit
·
dca8b66
1
Parent(s):
81d2ef5
quantisation added
Browse files- app.py +74 -38
- requirements.txt +1 -0
app.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1 |
import os
|
|
|
|
|
2 |
from fastapi import FastAPI, HTTPException
|
3 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
|
4 |
import torch
|
5 |
from pydantic import BaseModel
|
6 |
import traceback
|
@@ -28,44 +30,37 @@ try:
|
|
28 |
print("Successfully logged into Hugging Face Hub.")
|
29 |
except Exception as e:
|
30 |
print(f"Failed to log into Hugging Face Hub: {e}")
|
31 |
-
# The app will likely fail to load the model if login fails, so this print is for debugging.
|
32 |
|
33 |
-
# ---
|
34 |
model_id = "mistralai/Mistral-7B-Instruct-v0.3"
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HUGGINGFACEHUB_API_TOKEN)
|
37 |
model = AutoModelForCausalLM.from_pretrained(
|
38 |
model_id,
|
39 |
-
device_map="auto", # 'auto' handles device placement, including offloading
|
40 |
-
|
|
|
41 |
trust_remote_code=True,
|
42 |
token=HUGGINGFACEHUB_API_TOKEN
|
43 |
)
|
44 |
|
45 |
-
#
|
46 |
-
#
|
47 |
-
#
|
48 |
-
|
49 |
-
# device = "mps"
|
50 |
-
# elif torch.cuda.is_available():
|
51 |
-
# device = "cuda"
|
52 |
-
# else:
|
53 |
-
# device = "cpu"
|
54 |
-
# model.to(device) # This line is removed
|
55 |
-
|
56 |
-
# k=5 means it will keep the last 5 human-AI interaction pairs (10 messages total)
|
57 |
-
memory = ConversationBufferWindowMemory(k=5)
|
58 |
-
|
59 |
-
# Initialize Langchain HuggingFacePipeline
|
60 |
-
llm = HuggingFacePipeline(pipeline=pipeline(
|
61 |
-
"text-generation",
|
62 |
-
model=model,
|
63 |
-
tokenizer=tokenizer,
|
64 |
-
max_new_tokens=512,
|
65 |
-
return_full_text=True,
|
66 |
-
temperature=0.2,
|
67 |
-
do_sample=True,
|
68 |
-
))
|
69 |
|
70 |
# --- UPDATED PROMPT TEMPLATE ---
|
71 |
template = """<|im_start|>system
|
@@ -83,21 +78,61 @@ If you do not know the answer to a question, you truthfully state that it does n
|
|
83 |
|
84 |
PROMPT = PromptTemplate(input_variables=["history", "input"], template=template)
|
85 |
|
86 |
-
# Initialize Langchain ConversationChain
|
87 |
-
conversation = ConversationChain(llm=llm, memory=memory, prompt=PROMPT, verbose=True)
|
88 |
-
|
89 |
class QuestionRequest(BaseModel):
|
90 |
question: str
|
|
|
91 |
|
92 |
class ChatResponse(BaseModel):
|
93 |
response: str
|
|
|
94 |
|
95 |
@app.post("/api/generate")
|
96 |
async def generate_text(request: QuestionRequest):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
async def generate_stream():
|
|
|
|
|
|
|
|
|
|
|
98 |
started_streaming_ai_response = False
|
99 |
|
100 |
try:
|
|
|
|
|
|
|
|
|
101 |
response_stream = conversation.stream({"input": request.question})
|
102 |
|
103 |
stop_sequences_to_check = ["Human:", "AI:", "\nHuman:", "\nAI:", "<|im_end|>"]
|
@@ -123,22 +158,23 @@ async def generate_text(request: QuestionRequest):
|
|
123 |
if stop_seq in token_content:
|
124 |
token_content = token_content.split(stop_seq, 1)[0]
|
125 |
if token_content:
|
126 |
-
yield json.dumps({"content": token_content}) + "\n"
|
127 |
await asyncio.sleep(0.01)
|
128 |
-
yield json.dumps({"status": "completed"}) + "\n"
|
129 |
return
|
130 |
|
131 |
if token_content:
|
132 |
-
yield json.dumps({"content": token_content}) + "\n"
|
133 |
await asyncio.sleep(0.01)
|
134 |
|
135 |
-
yield json.dumps({"status": "completed"}) + "\n"
|
136 |
|
137 |
except Exception as e:
|
138 |
-
print("Error during streaming generation:")
|
139 |
traceback.print_exc()
|
140 |
-
yield json.dumps({"error": str(e)}) + "\n"
|
141 |
|
|
|
142 |
return StreamingResponse(generate_stream(), media_type="application/json")
|
143 |
|
144 |
if __name__ == "__main__":
|
|
|
1 |
import os
|
2 |
+
import uuid
|
3 |
+
from typing import Dict, Optional
|
4 |
from fastapi import FastAPI, HTTPException
|
5 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig # Import BitsAndBytesConfig
|
6 |
import torch
|
7 |
from pydantic import BaseModel
|
8 |
import traceback
|
|
|
30 |
print("Successfully logged into Hugging Face Hub.")
|
31 |
except Exception as e:
|
32 |
print(f"Failed to log into Hugging Face Hub: {e}")
|
|
|
33 |
|
34 |
+
# --- Initialize tokenizer and model globally (heavy to load, shared across sessions) ---
|
35 |
model_id = "mistralai/Mistral-7B-Instruct-v0.3"
|
36 |
|
37 |
+
# --- NEW: Quantization configuration for 4-bit loading, optimized for T4 ---
|
38 |
+
# This configuration tells Hugging Face Transformers to load the model weights
|
39 |
+
# in 4-bit precision using the bitsandbytes library.
|
40 |
+
bnb_config = BitsAndBytesConfig(
|
41 |
+
load_in_4bit=True, # Enable 4-bit quantization
|
42 |
+
bnb_4bit_quant_type="nf4", # Specify the quantization type: "nf4" (NormalFloat 4-bit) is recommended for transformers
|
43 |
+
# --- IMPORTANT CHANGE: Use float16 for compute dtype for T4 compatibility ---
|
44 |
+
# T4 GPUs (Turing architecture) do not have native bfloat16 support.
|
45 |
+
# Using float16 for computations is more efficient and prevents CPU offloading.
|
46 |
+
bnb_4bit_compute_dtype=torch.float16,
|
47 |
+
bnb_4bit_use_double_quant=True, # Use double quantization for slightly better quality
|
48 |
+
)
|
49 |
+
|
50 |
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HUGGINGFACEHUB_API_TOKEN)
|
51 |
model = AutoModelForCausalLM.from_pretrained(
|
52 |
model_id,
|
53 |
+
device_map="auto", # 'auto' handles device placement, including offloading to CPU if necessary (but quantization aims to prevent this)
|
54 |
+
quantization_config=bnb_config, # Pass the quantization configuration here
|
55 |
+
# torch_dtype=torch.bfloat16, # REMOVED: This is now handled by bnb_4bit_compute_dtype
|
56 |
trust_remote_code=True,
|
57 |
token=HUGGINGFACEHUB_API_TOKEN
|
58 |
)
|
59 |
|
60 |
+
# Global dictionary to store active conversation chains, keyed by session_id.
|
61 |
+
# IMPORTANT: In a production environment, this in-memory dictionary will reset
|
62 |
+
# if the server restarts. For true persistence, you would use a database (e.g., Redis, Firestore).
|
63 |
+
active_conversations: Dict[str, ConversationChain] = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
|
65 |
# --- UPDATED PROMPT TEMPLATE ---
|
66 |
template = """<|im_start|>system
|
|
|
78 |
|
79 |
PROMPT = PromptTemplate(input_variables=["history", "input"], template=template)
|
80 |
|
|
|
|
|
|
|
81 |
class QuestionRequest(BaseModel):
|
82 |
question: str
|
83 |
+
session_id: Optional[str] = None # Optional session ID for continuing conversations
|
84 |
|
85 |
class ChatResponse(BaseModel):
|
86 |
response: str
|
87 |
+
session_id: str # Include session_id in the response for client to track
|
88 |
|
89 |
@app.post("/api/generate")
|
90 |
async def generate_text(request: QuestionRequest):
|
91 |
+
"""
|
92 |
+
Handles text generation requests, maintaining conversation history per session.
|
93 |
+
"""
|
94 |
+
session_id = request.session_id
|
95 |
+
|
96 |
+
# If no session_id is provided, generate a new one.
|
97 |
+
# This signifies the start of a new conversation.
|
98 |
+
if session_id is None:
|
99 |
+
session_id = str(uuid.uuid4())
|
100 |
+
print(f"Starting new conversation with session_id: {session_id}")
|
101 |
+
|
102 |
+
# Retrieve or create a ConversationChain for this session_id
|
103 |
+
if session_id not in active_conversations:
|
104 |
+
print(f"Creating new ConversationChain for session_id: {session_id}")
|
105 |
+
# Initialize Langchain HuggingFacePipeline for this session
|
106 |
+
llm = HuggingFacePipeline(pipeline=pipeline(
|
107 |
+
"text-generation",
|
108 |
+
model=model, # Use the globally loaded model
|
109 |
+
tokenizer=tokenizer, # Use the globally loaded tokenizer
|
110 |
+
max_new_tokens=512,
|
111 |
+
return_full_text=True,
|
112 |
+
temperature=0.2,
|
113 |
+
do_sample=True,
|
114 |
+
))
|
115 |
+
# Initialize memory for this specific session
|
116 |
+
memory = ConversationBufferWindowMemory(k=5) # Remembers the last 5 human-AI interaction pairs
|
117 |
+
conversation = ConversationChain(llm=llm, memory=memory, prompt=PROMPT, verbose=True)
|
118 |
+
active_conversations[session_id] = conversation
|
119 |
+
else:
|
120 |
+
print(f"Continuing conversation for session_id: {session_id}")
|
121 |
+
conversation = active_conversations[session_id]
|
122 |
+
|
123 |
async def generate_stream():
|
124 |
+
"""
|
125 |
+
An asynchronous generator function to stream text responses token-by-token.
|
126 |
+
Each yielded item will be a JSON string representing a part of the stream.
|
127 |
+
"""
|
128 |
+
# Flag to indicate when we've started streaming the AI's actual response
|
129 |
started_streaming_ai_response = False
|
130 |
|
131 |
try:
|
132 |
+
# First, send a JSON object containing the session_id.
|
133 |
+
# This allows the client to immediately get the session ID.
|
134 |
+
yield json.dumps({"type": "session_info", "session_id": session_id}) + "\n"
|
135 |
+
|
136 |
response_stream = conversation.stream({"input": request.question})
|
137 |
|
138 |
stop_sequences_to_check = ["Human:", "AI:", "\nHuman:", "\nAI:", "<|im_end|>"]
|
|
|
158 |
if stop_seq in token_content:
|
159 |
token_content = token_content.split(stop_seq, 1)[0]
|
160 |
if token_content:
|
161 |
+
yield json.dumps({"type": "token", "content": token_content}) + "\n"
|
162 |
await asyncio.sleep(0.01)
|
163 |
+
yield json.dumps({"type": "end", "status": "completed", "session_id": session_id}) + "\n"
|
164 |
return
|
165 |
|
166 |
if token_content:
|
167 |
+
yield json.dumps({"type": "token", "content": token_content}) + "\n"
|
168 |
await asyncio.sleep(0.01)
|
169 |
|
170 |
+
yield json.dumps({"type": "end", "status": "completed", "session_id": session_id}) + "\n"
|
171 |
|
172 |
except Exception as e:
|
173 |
+
print(f"Error during streaming generation for session {session_id}:")
|
174 |
traceback.print_exc()
|
175 |
+
yield json.dumps({"type": "error", "message": str(e), "session_id": session_id}) + "\n"
|
176 |
|
177 |
+
# Return a StreamingResponse with application/json media type
|
178 |
return StreamingResponse(generate_stream(), media_type="application/json")
|
179 |
|
180 |
if __name__ == "__main__":
|
requirements.txt
CHANGED
@@ -69,3 +69,4 @@ uvicorn==0.34.0
|
|
69 |
yarl==1.19.0
|
70 |
zstandard==0.23.0
|
71 |
protobuf
|
|
|
|
69 |
yarl==1.19.0
|
70 |
zstandard==0.23.0
|
71 |
protobuf
|
72 |
+
bitsandbytes==0.43.0
|