Spaces: Create app.py

app.py (ADDED, 777 lines)
import gradio as gr
import os
import tempfile
import uuid
from datetime import datetime
from typing import List, Dict, Any, Optional
import json
import asyncio
from dataclasses import dataclass, asdict
import logging

# Document processing imports
import PyPDF2
import pandas as pd
from docx import Document
from pptx import Presentation
import markdown

# ML/AI imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document as LCDocument
from huggingface_hub import InferenceClient

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# MCP Message Structure
@dataclass
class MCPMessage:
    sender: str
    receiver: str
    type: str
    trace_id: str
    payload: Dict[str, Any]
    timestamp: Optional[str] = None

    def __post_init__(self):
        if self.timestamp is None:
            self.timestamp = datetime.now().isoformat()

    def to_dict(self):
        return asdict(self)

# MCP Communication Layer
class MCPCommunicator:
    def __init__(self):
        self.message_queue = asyncio.Queue()
        self.subscribers = {}

    async def send_message(self, message: MCPMessage):
        logger.info(f"MCP: {message.sender} -> {message.receiver}: {message.type}")
        await self.message_queue.put(message)

    async def receive_message(self, agent_name: str) -> MCPMessage:
        while True:
            message = await self.message_queue.get()
            if message.receiver == agent_name:
                return message
            # Re-queue if not for this agent
            await self.message_queue.put(message)

# Global MCP instance
mcp = MCPCommunicator()
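
# Illustrative sketch (not executed): how a message round-trips through the
# MCP queue above. In this app the CoordinatorAgent calls the other agents
# directly and uses MCP messages for logging/traceability, so this is only
# a usage example:
#
#   async def _mcp_demo():
#       await mcp.send_message(MCPMessage(
#           sender="CoordinatorAgent",
#           receiver="IngestionAgent",
#           type="DOCUMENT_INGESTION_REQUEST",
#           trace_id="demo-1",
#           payload={"files": ["report.pdf"]},
#       ))
#       msg = await mcp.receive_message("IngestionAgent")
#       print(msg.to_dict())
#
#   asyncio.run(_mcp_demo())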

# Base Agent Class
class BaseAgent:
    def __init__(self, name: str):
        self.name = name
        self.mcp = mcp

    async def send_mcp_message(self, receiver: str, msg_type: str, payload: Dict[str, Any], trace_id: str):
        message = MCPMessage(
            sender=self.name,
            receiver=receiver,
            type=msg_type,
            trace_id=trace_id,
            payload=payload
        )
        await self.mcp.send_message(message)

    async def receive_mcp_message(self) -> MCPMessage:
        return await self.mcp.receive_message(self.name)

# Document Ingestion Agent
class IngestionAgent(BaseAgent):
    def __init__(self):
        super().__init__("IngestionAgent")
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )

    def parse_pdf(self, file_path: str) -> str:
        """Parse PDF file and extract text"""
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                text = ""
                for page in pdf_reader.pages:
                    text += page.extract_text() + "\n"
                return text
        except Exception as e:
            logger.error(f"Error parsing PDF: {e}")
            return ""

    def parse_docx(self, file_path: str) -> str:
        """Parse DOCX file and extract text"""
        try:
            doc = Document(file_path)
            text = ""
            for paragraph in doc.paragraphs:
                text += paragraph.text + "\n"
            return text
        except Exception as e:
            logger.error(f"Error parsing DOCX: {e}")
            return ""

    def parse_pptx(self, file_path: str) -> str:
        """Parse PPTX file and extract text"""
        try:
            prs = Presentation(file_path)
            text = ""
            for slide_num, slide in enumerate(prs.slides, 1):
                text += f"Slide {slide_num}:\n"
                for shape in slide.shapes:
                    if hasattr(shape, "text"):
                        text += shape.text + "\n"
                text += "\n"
            return text
        except Exception as e:
            logger.error(f"Error parsing PPTX: {e}")
            return ""

    def parse_csv(self, file_path: str) -> str:
        """Parse CSV file and convert to text"""
        try:
            df = pd.read_csv(file_path)
            return df.to_string()
        except Exception as e:
            logger.error(f"Error parsing CSV: {e}")
            return ""

    def parse_txt_md(self, file_path: str) -> str:
        """Parse TXT or MD file"""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
            # If markdown, render it (note: markdown.markdown returns HTML, not plain text)
            if file_path.lower().endswith('.md'):
                content = markdown.markdown(content)
            return content
        except Exception as e:
            logger.error(f"Error parsing TXT/MD: {e}")
            return ""

    async def process_documents(self, files: List[str], trace_id: str) -> List[LCDocument]:
        """Process uploaded documents and return chunked documents"""
        all_documents = []

        for file_path in files:
            file_ext = os.path.splitext(file_path)[1].lower()
            filename = os.path.basename(file_path)

            # Parse based on file extension
            if file_ext == '.pdf':
                content = self.parse_pdf(file_path)
            elif file_ext == '.docx':
                content = self.parse_docx(file_path)
            elif file_ext == '.pptx':
                content = self.parse_pptx(file_path)
            elif file_ext == '.csv':
                content = self.parse_csv(file_path)
            elif file_ext in ['.txt', '.md']:
                content = self.parse_txt_md(file_path)
            else:
                logger.warning(f"Unsupported file type: {file_ext}")
                continue

            if content.strip():
                # Split content into chunks
                chunks = self.text_splitter.split_text(content)

                # Create LangChain documents
                for i, chunk in enumerate(chunks):
                    doc = LCDocument(
                        page_content=chunk,
                        metadata={
                            "source": filename,
                            "chunk_id": i,
                            "file_type": file_ext
                        }
                    )
                    all_documents.append(doc)

        return all_documents
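
# Illustrative (not executed): the shape of one chunk emitted above, assuming
# a hypothetical upload named "report.pdf":
#   LCDocument(page_content="Revenue increased by 25% ...",
#              metadata={"source": "report.pdf", "chunk_id": 0, "file_type": ".pdf"})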

# Retrieval Agent
class RetrievalAgent(BaseAgent):
    def __init__(self):
        super().__init__("RetrievalAgent")
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        self.vector_store = None

    async def create_vector_store(self, documents: List[LCDocument], trace_id: str):
        """Create vector store from documents"""
        try:
            if documents:
                self.vector_store = FAISS.from_documents(documents, self.embeddings)
                logger.info(f"Created vector store with {len(documents)} documents")
            else:
                logger.warning("No documents to create vector store")
        except Exception as e:
            logger.error(f"Error creating vector store: {e}")

    async def retrieve_relevant_chunks(self, query: str, k: int = 5, trace_id: str = None) -> List[Dict]:
        """Retrieve relevant chunks for a query"""
        if not self.vector_store:
            return []

        try:
            # Similarity search
            docs = self.vector_store.similarity_search(query, k=k)

            # Format results
            results = []
            for doc in docs:
                results.append({
                    "content": doc.page_content,
                    "source": doc.metadata.get("source", "Unknown"),
                    "chunk_id": doc.metadata.get("chunk_id", 0),
                    "file_type": doc.metadata.get("file_type", "Unknown")
                })

            return results
        except Exception as e:
            logger.error(f"Error retrieving chunks: {e}")
            return []

# LLM Response Agent
class LLMResponseAgent(BaseAgent):
    def __init__(self, hf_token: str = None):
        super().__init__("LLMResponseAgent")
        self.client = InferenceClient(
            model="meta-llama/Llama-3.1-8B-Instruct",
            token=hf_token
        )

    def format_prompt(self, query: str, context_chunks: List[Dict]) -> str:
        """Format prompt with context and query"""
        context_text = "\n\n".join([
            f"Source: {chunk['source']}\nContent: {chunk['content']}"
            for chunk in context_chunks
        ])

        prompt = f"""Based on the following context from uploaded documents, please answer the user's question.

Context:
{context_text}

Question: {query}

Please provide a comprehensive answer based on the context above. If the context doesn't contain enough information to fully answer the question, please mention what information is available and what might be missing.

Answer:"""

        return prompt

    async def generate_response(self, query: str, context_chunks: List[Dict], trace_id: str) -> str:
        """Generate response using LLM"""
        try:
            prompt = self.format_prompt(query, context_chunks)

            # Generate response using HuggingFace Inference
            response = self.client.text_generation(
                prompt,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                return_full_text=False
            )

            return response
        except Exception as e:
            logger.error(f"Error generating response: {e}")
            return f"I apologize, but I encountered an error while generating the response: {str(e)}"

# Coordinator Agent
class CoordinatorAgent(BaseAgent):
    def __init__(self, hf_token: str = None):
        super().__init__("CoordinatorAgent")
        self.ingestion_agent = IngestionAgent()
        self.retrieval_agent = RetrievalAgent()
        self.llm_agent = LLMResponseAgent(hf_token)
        self.documents_processed = False

    async def process_documents(self, files: List[str]) -> str:
        """Orchestrate document processing"""
        trace_id = str(uuid.uuid4())

        try:
            # Step 1: Ingestion
            await self.send_mcp_message(
                "IngestionAgent",
                "DOCUMENT_INGESTION_REQUEST",
                {"files": files},
                trace_id
            )

            documents = await self.ingestion_agent.process_documents(files, trace_id)

            await self.send_mcp_message(
                "RetrievalAgent",
                "VECTOR_STORE_CREATE_REQUEST",
                {"documents": len(documents)},
                trace_id
            )

            # Step 2: Create vector store
            await self.retrieval_agent.create_vector_store(documents, trace_id)

            self.documents_processed = True

            return f"Successfully processed {len(documents)} document chunks from {len(files)} files."

        except Exception as e:
            logger.error(f"Error in document processing: {e}")
            return f"Error processing documents: {str(e)}"

    async def answer_query(self, query: str) -> tuple[str, List[Dict]]:
        """Orchestrate query answering"""
        if not self.documents_processed:
            return "Please upload and process documents first.", []

        trace_id = str(uuid.uuid4())

        try:
            # Step 1: Retrieval
            await self.send_mcp_message(
                "RetrievalAgent",
                "RETRIEVAL_REQUEST",
                {"query": query},
                trace_id
            )

            context_chunks = await self.retrieval_agent.retrieve_relevant_chunks(query, k=5, trace_id=trace_id)

            # Step 2: LLM Response
            await self.send_mcp_message(
                "LLMResponseAgent",
                "LLM_GENERATION_REQUEST",
                {"query": query, "context_chunks": len(context_chunks)},
                trace_id
            )

            response = await self.llm_agent.generate_response(query, context_chunks, trace_id)

            return response, context_chunks

        except Exception as e:
            logger.error(f"Error in query processing: {e}")
            return f"Error processing query: {str(e)}", []
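
# Illustrative sketch (not executed): driving the pipeline without the UI,
# assuming a hypothetical local file "report.pdf" and a valid HF token:
#
#   c = CoordinatorAgent(hf_token="hf_...")
#   print(asyncio.run(c.process_documents(["report.pdf"])))
#   answer, chunks = asyncio.run(c.answer_query("What were the Q1 KPIs?"))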

# Global coordinator instance
coordinator = None

def initialize_app(hf_token):
    """Initialize the application with HuggingFace token"""
    global coordinator
    coordinator = CoordinatorAgent(hf_token)
    return "✅ Application initialized successfully!"

async def process_files(files):
    """Process uploaded files"""
    if not coordinator:
        return "❌ Please set your HuggingFace token first!"

    if not files:
        return "❌ Please upload at least one file."

    # Gradio already stores uploads as temp files on disk; pass their paths
    # through directly (file objects expose the path as .name)
    file_paths = [file.name if hasattr(file, "name") else file for file in files]

    result = await coordinator.process_documents(file_paths)

    return result

async def answer_question(query, history):
    """Answer user question and append the exchange to the chat history"""
    history = history or []

    if not coordinator:
        history.append((query, "❌ Please set your HuggingFace token first!"))
        return history

    if not query.strip():
        return history

    response, context_chunks = await coordinator.answer_query(query)

    # Format response with sources
    if context_chunks:
        sources = "\n\n**Sources:**\n"
        for i, chunk in enumerate(context_chunks[:3], 1):  # Show top 3 sources
            sources += f"{i}. {chunk['source']} (Chunk {chunk['chunk_id']})\n"
        response += sources

    history.append((query, response))
    return history

# Custom CSS
custom_css = """
/* Main container styling */
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif !important;
}

/* Header styling */
.header-container {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
    padding: 2rem !important;
    border-radius: 15px !important;
    margin-bottom: 2rem !important;
    text-align: center !important;
    box-shadow: 0 8px 32px rgba(0,0,0,0.1) !important;
}

.header-title {
    font-size: 2.5rem !important;
    font-weight: 700 !important;
    margin-bottom: 0.5rem !important;
    text-shadow: 2px 2px 4px rgba(0,0,0,0.3) !important;
}

.header-subtitle {
    font-size: 1.2rem !important;
    opacity: 0.9 !important;
    font-weight: 300 !important;
}

/* Tab styling */
.tab-nav {
    background: white !important;
    border-radius: 12px !important;
    box-shadow: 0 4px 20px rgba(0,0,0,0.08) !important;
    padding: 0.5rem !important;
    margin-bottom: 1rem !important;
}

/* Card styling */
.setup-card, .upload-card, .chat-card {
    background: white !important;
    border-radius: 15px !important;
    padding: 2rem !important;
    box-shadow: 0 4px 20px rgba(0,0,0,0.08) !important;
    border: 1px solid #e1e5e9 !important;
    margin-bottom: 1.5rem !important;
}

/* Button styling */
.primary-button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
    border: none !important;
    border-radius: 10px !important;
    padding: 0.75rem 2rem !important;
    font-weight: 600 !important;
    transition: all 0.3s ease !important;
    box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3) !important;
}

.primary-button:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 6px 20px rgba(102, 126, 234, 0.4) !important;
}

/* Chat interface styling */
.chat-container {
    max-height: 600px !important;
    overflow-y: auto !important;
    background: #f8f9fa !important;
    border-radius: 15px !important;
    padding: 1rem !important;
    border: 1px solid #e1e5e9 !important;
}

/* Input styling */
.input-container input, .input-container textarea {
    border: 2px solid #e1e5e9 !important;
    border-radius: 10px !important;
    padding: 0.75rem 1rem !important;
    font-size: 1rem !important;
    transition: border-color 0.3s ease !important;
}

.input-container input:focus, .input-container textarea:focus {
    border-color: #667eea !important;
    box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1) !important;
    outline: none !important;
}

/* Status indicators */
.status-success {
    color: #28a745 !important;
    background: #d4edda !important;
    padding: 0.75rem 1rem !important;
    border-radius: 8px !important;
    border: 1px solid #c3e6cb !important;
    margin: 1rem 0 !important;
}

.status-error {
    color: #dc3545 !important;
    background: #f8d7da !important;
    padding: 0.75rem 1rem !important;
    border-radius: 8px !important;
    border: 1px solid #f5c6cb !important;
    margin: 1rem 0 !important;
}

/* File upload styling */
.file-upload {
    border: 2px dashed #667eea !important;
    border-radius: 15px !important;
    padding: 2rem !important;
    text-align: center !important;
    background: #f8f9ff !important;
    transition: all 0.3s ease !important;
}

.file-upload:hover {
    border-color: #764ba2 !important;
    background: #f0f4ff !important;
}

/* Architecture diagram container */
.architecture-container {
    background: white !important;
    border-radius: 15px !important;
    padding: 2rem !important;
    margin: 1rem 0 !important;
    box-shadow: 0 4px 20px rgba(0,0,0,0.08) !important;
    text-align: center !important;
}

/* Responsive design */
@media (max-width: 768px) {
    .header-title {
        font-size: 2rem !important;
    }

    .setup-card, .upload-card, .chat-card {
        padding: 1.5rem !important;
    }
}

/* Animation for loading states */
@keyframes pulse {
    0% { opacity: 1; }
    50% { opacity: 0.5; }
    100% { opacity: 1; }
}

.loading {
    animation: pulse 1.5s ease-in-out infinite !important;
}
"""

# Create Gradio Interface
def create_interface():
    with gr.Blocks(css=custom_css, title="🤖 Agentic RAG Chatbot") as demo:
        gr.HTML("""
        <div class="header-container">
            <h1 class="header-title">🤖 Agentic RAG Chatbot</h1>
            <p class="header-subtitle">Multi-Format Document QA using Model Context Protocol (MCP)</p>
        </div>
        """)

        with gr.Tabs() as tabs:
            # Setup Tab
            with gr.TabItem("⚙️ Setup", elem_classes=["tab-nav"]):
                gr.HTML("""
                <div class="setup-card">
                    <h3>🔑 Configuration</h3>
                    <p>Enter your HuggingFace token to get started. This token is used to access the Llama-3.1-8B-Instruct model.</p>
                </div>
                """)

                with gr.Row():
                    hf_token_input = gr.Textbox(
                        label="HuggingFace Token",
                        placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxx",
                        type="password",
                        elem_classes=["input-container"]
                    )

                with gr.Row():
                    init_button = gr.Button(
                        "Initialize Application",
                        variant="primary",
                        elem_classes=["primary-button"]
                    )

                init_status = gr.Textbox(
                    label="Status",
                    interactive=False,
                    elem_classes=["input-container"]
                )

            # Upload Tab
            with gr.TabItem("📁 Upload Documents", elem_classes=["tab-nav"]):
                gr.HTML("""
                <div class="upload-card">
                    <h3>📄 Document Upload</h3>
                    <p>Upload your documents in any supported format: PDF, DOCX, PPTX, CSV, TXT, or Markdown.</p>
                </div>
                """)

                file_upload = gr.File(
                    label="Choose Files",
                    file_count="multiple",
                    file_types=[".pdf", ".docx", ".pptx", ".csv", ".txt", ".md"],
                    elem_classes=["file-upload"]
                )

                upload_button = gr.Button(
                    "Process Documents",
                    variant="primary",
                    elem_classes=["primary-button"]
                )

                upload_status = gr.Textbox(
                    label="Processing Status",
                    interactive=False,
                    elem_classes=["input-container"]
                )

            # Chat Tab
            with gr.TabItem("💬 Chat", elem_classes=["tab-nav"]):
                gr.HTML("""
                <div class="chat-card">
                    <h3>🗨️ Ask Questions</h3>
                    <p>Ask questions about your uploaded documents. The AI will provide answers based on the document content.</p>
                </div>
                """)

                chatbot = gr.Chatbot(
                    label="Conversation",
                    height=400,
                    elem_classes=["chat-container"]
                )

                with gr.Row():
                    query_input = gr.Textbox(
                        label="Your Question",
                        placeholder="What are the key findings in the document?",
                        elem_classes=["input-container"]
                    )
                    ask_button = gr.Button(
                        "Ask",
                        variant="primary",
                        elem_classes=["primary-button"]
                    )

                gr.Examples(
                    examples=[
                        "What are the main topics covered in the documents?",
                        "Can you summarize the key findings?",
                        "What are the important metrics mentioned?",
                        "What recommendations are provided?",
                    ],
                    inputs=query_input,
                    label="Example Questions"
                )

            # Architecture Tab
            with gr.TabItem("🏗️ Architecture", elem_classes=["tab-nav"]):
                gr.HTML("""
                <div class="architecture-container">
                    <h3>🏗️ System Architecture</h3>
                    <p>This system uses an agentic architecture with Model Context Protocol (MCP) for inter-agent communication.</p>
                </div>
                """)

                gr.Markdown("""
## 🔄 Agent Flow Diagram

```
User Upload → CoordinatorAgent → IngestionAgent → RetrievalAgent → LLMResponseAgent
     ↓               ↓                 ↓                ↓                  ↓
 Documents      MCP Messages      Text Chunks     Vector Store      Final Response
```

## 🤖 Agent Descriptions

- **CoordinatorAgent**: Orchestrates the entire workflow and manages MCP communication
- **IngestionAgent**: Parses and preprocesses documents (PDF, DOCX, PPTX, CSV, TXT, MD)
- **RetrievalAgent**: Handles embeddings and semantic retrieval using FAISS
- **LLMResponseAgent**: Generates final responses using Llama-3.1-8B-Instruct

## 🛠️ Tech Stack

- **Frontend**: Gradio with custom CSS
- **LLM**: Meta Llama-3.1-8B-Instruct (via HuggingFace Inference)
- **Embeddings**: sentence-transformers/all-MiniLM-L6-v2
- **Vector Store**: FAISS
- **Document Processing**: PyPDF2, python-docx, python-pptx, pandas
- **Framework**: LangChain for document handling

## 📨 MCP Message Example

```json
{
    "sender": "RetrievalAgent",
    "receiver": "LLMResponseAgent",
    "type": "RETRIEVAL_RESULT",
    "trace_id": "rag-457",
    "payload": {
        "retrieved_context": ["Revenue increased by 25%", "Q1 KPIs exceeded targets"],
        "query": "What were the Q1 KPIs?"
    },
    "timestamp": "2025-07-21T10:30:00Z"
}
```
""")

        # Event handlers
        init_button.click(
            fn=initialize_app,
            inputs=[hf_token_input],
            outputs=[init_status]
        )

        upload_button.click(
            fn=process_files,
            inputs=[file_upload],
            outputs=[upload_status]
        )

        ask_button.click(
            fn=answer_question,
            inputs=[query_input, chatbot],
            outputs=[chatbot]
        )

        query_input.submit(
            fn=answer_question,
            inputs=[query_input, chatbot],
            outputs=[chatbot]
        )

    return demo

if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_api=False
    )
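
Note: the file above imports several third-party packages. A plausible requirements.txt for this Space, inferred from the imports (the package list follows from the code, but any version pins would be assumptions, so none are given):

gradio
PyPDF2
pandas
python-docx
python-pptx
markdown
langchain
sentence-transformers
faiss-cpu
huggingface_hub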