Rasoul Nikbakht committed
Commit 01e3388 · 1 Parent(s): 57ec4a7

Create a chat interface with default knowledge.

Files changed (5)
  1. .gitignore +25 -0
  2. app.py +663 -0
  3. cache_manifest.json +0 -0
  4. requirements.txt +10 -0
  5. user_data.json +3 -0
.gitignore ADDED
@@ -0,0 +1,25 @@
+ # Cache directories
+ cached_embeddings/
+ hf_cache/
+ __pycache__/
+
+ # Environment variables
+ .env
+
+ # OS specific files
+ .DS_Store
+ *.pyc
+ *.pyo
+
+ # Potential IDE files
+ .vscode/
+ .idea/
+
+ # Log files (if any planned)
+ *.log
+
+ # User data (consider if this should be gitignored - depends on use case)
+ # If it tracks users across deployments, maybe keep it, but be mindful of privacy.
+ # If it's just for local testing, ignore it.
+ # user_data.json
+ # cache_manifest.json
app.py ADDED
@@ -0,0 +1,663 @@
+ import os
+ import re
+ import json
+ import logging
+ import hashlib
+ from pathlib import Path
+ from typing import List, Tuple, Dict, Any, Optional
+
+ import gradio as gr
+ from dotenv import load_dotenv
+ from huggingface_hub import hf_hub_download
+ from huggingface_hub.utils import EntryNotFoundError
+ from requests.exceptions import HTTPError
+
+ from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import FAISS
+ from langchain_core.documents import Document
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.runnables import RunnableParallel, RunnablePassthrough
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+
+ # -----------------------------------------------------------------------------
+ # Configuration & Environment Variables
+ # -----------------------------------------------------------------------------
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ load_dotenv()  # Load .env file for local development
+
+ # --- API Keys ---
+ # Base key is for potentially pre-processing fixed files (if needed)
+ # User key is required for processing *new* dynamic files
+ BASE_OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # Used for pre-processing base files if needed
+ HF_TOKEN = os.getenv("HF_TOKEN")
+
+ # --- Constants ---
+ DATASET_ID = "rasoul-nikbakht/TSpec-LLM"
+ DATA_SUBDIR = "3GPP-clean"
+ EMBEDDING_MODEL = "text-embedding-3-small"
+ LLM_MODEL = "gpt-4o-mini"
+ MAX_DYNAMIC_FILES = 3
+ ESTIMATED_COST_PER_FILE_CENTS = 2  # Rough estimate
+
+ # --- File Paths ---
+ SCRIPT_DIR = Path(__file__).parent
+ CACHE_DIR = SCRIPT_DIR / "cached_embeddings"
+ BASE_KNOWLEDGE_INDEX_PATH = CACHE_DIR / "base_knowledge.faiss"
+ USER_DATA_PATH = SCRIPT_DIR / "user_data.json"
+ CACHE_MANIFEST_PATH = SCRIPT_DIR / "cache_manifest.json"
+
+ # Ensure cache directory exists
+ CACHE_DIR.mkdir(exist_ok=True)
+
+ # --- Fixed Base Knowledge Files ---
+ # Relative paths within the dataset repo (without DATA_SUBDIR)
+ FIXED_FILES = [
+     "Rel-16/38_series/38901-g10.md",
+     "Rel-16/38_series/38821-g20.md",
+     "Rel-15/36_series/36777-f00_1.md",
+     "Rel-15/36_series/36777-f00_2.md",
+ ]
+
+ # -----------------------------------------------------------------------------
+ # Global Variables & In-Memory Stores (Load at startup)
+ # -----------------------------------------------------------------------------
+ base_knowledge_index: Optional[FAISS] = None
+ user_data: Dict[str, List[str]] = {}  # {email: [list_of_processed_files]}
+ cache_manifest: Dict[str, str] = {}  # {repo_relative_path: local_faiss_path}
+
+ # -----------------------------------------------------------------------------
+ # Helper Functions
+ # -----------------------------------------------------------------------------
+
+ def sanitize_path_for_filename(repo_path: str) -> str:
+     """Creates a safe filename from a repository path."""
+     # Remove base dir prefix if present
+     if repo_path.startswith(f"{DATA_SUBDIR}/"):
+         repo_path = repo_path[len(f"{DATA_SUBDIR}/"):]
+     # Replace slashes and invalid chars; use hashing for very long paths if needed
+     sanitized = re.sub(r'[\\/*?:"<>|]', '_', repo_path)
+     # Optional: Limit length and add hash if too long
+     if len(sanitized) > 100:
+         hash_suffix = hashlib.md5(repo_path.encode()).hexdigest()[:8]
+         sanitized = sanitized[:90] + "_" + hash_suffix
+     return sanitized + ".faiss"
+
+ def is_valid_email(email: str) -> bool:
+     """Basic regex check for email format."""
+     # This is a simple check, not foolproof validation
+     pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
+     return re.match(pattern, email) is not None
+
+ def load_user_data():
+     """Loads user email and associated file data from JSON."""
+     global user_data
+     if USER_DATA_PATH.exists():
+         try:
+             with open(USER_DATA_PATH, 'r') as f:
+                 user_data = json.load(f)
+             logging.info(f"Loaded user data for {len(user_data)} users from {USER_DATA_PATH}")
+         except json.JSONDecodeError:
+             logging.error(f"Error decoding JSON from {USER_DATA_PATH}. Starting with empty user data.")
+             user_data = {}
+         except Exception as e:
+             logging.error(f"Failed to load user data: {e}", exc_info=True)
+             user_data = {}
+     else:
+         logging.info("User data file not found. Starting fresh.")
+         user_data = {}
+
+ def save_user_data():
+     """Saves user email and associated file data to JSON."""
+     try:
+         with open(USER_DATA_PATH, 'w') as f:
+             json.dump(user_data, f, indent=4)
+         # logging.info(f"Saved user data to {USER_DATA_PATH}")  # Can be noisy
+     except Exception as e:
+         logging.error(f"Failed to save user data: {e}", exc_info=True)
+
+ def load_cache_manifest():
+     """Loads the manifest of locally cached embeddings."""
+     global cache_manifest
+     if CACHE_MANIFEST_PATH.exists():
+         try:
+             with open(CACHE_MANIFEST_PATH, 'r') as f:
+                 cache_manifest = json.load(f)
+             logging.info(f"Loaded cache manifest with {len(cache_manifest)} entries from {CACHE_MANIFEST_PATH}")
+             # Optional: Verify that the referenced FAISS files actually exist
+             # keys_to_remove = [k for k, v in cache_manifest.items() if not Path(v).exists()]
+             # if keys_to_remove:
+             #     logging.warning(f"Removing {len(keys_to_remove)} stale entries from cache manifest.")
+             #     for k in keys_to_remove: del cache_manifest[k]
+             #     save_cache_manifest()  # Save cleaned manifest
+         except json.JSONDecodeError:
+             logging.error(f"Error decoding JSON from {CACHE_MANIFEST_PATH}. Starting with empty manifest.")
+             cache_manifest = {}
+         except Exception as e:
+             logging.error(f"Failed to load cache manifest: {e}", exc_info=True)
+             cache_manifest = {}
+     else:
+         logging.info("Cache manifest file not found. Starting fresh.")
+         cache_manifest = {}
+
+ def save_cache_manifest():
+     """Saves the manifest of locally cached embeddings."""
+     try:
+         with open(CACHE_MANIFEST_PATH, 'w') as f:
+             json.dump(cache_manifest, f, indent=4)
+         # logging.info(f"Saved cache manifest to {CACHE_MANIFEST_PATH}")  # Can be noisy
+     except Exception as e:
+         logging.error(f"Failed to save cache manifest: {e}", exc_info=True)
+
+ def download_and_process_file(repo_relative_path: str, api_key_for_embedding: str) -> Optional[FAISS]:
+     """Downloads, chunks, embeds a single file, returning a FAISS index."""
+     if not HF_TOKEN:
+         logging.error("HF_TOKEN is missing. Cannot download from gated dataset.")
+         # Don't raise gr.Error here, handle return value in caller
+         return None
+     if not api_key_for_embedding:
+         logging.error("OpenAI API Key is missing. Cannot create embeddings.")
+         return None
+
+     full_repo_path = f"{DATA_SUBDIR}/{repo_relative_path}"
+     logging.info(f"Processing file: {repo_relative_path}")
+
+     # --- Download ---
+     try:
+         local_path_str = hf_hub_download(
+             repo_id=DATASET_ID,
+             filename=full_repo_path,
+             repo_type="dataset",
+             token=HF_TOKEN,
+             cache_dir="./hf_cache"
+         )
+         local_path = Path(local_path_str)
+         logging.info(f"Downloaded {repo_relative_path} to: {local_path}")
+     except EntryNotFoundError:
+         logging.error(f"File not found in repository: {full_repo_path}")
+         raise gr.Error(f"File not found in repository: '{repo_relative_path}'. Please check the path.")
+     except HTTPError as e:
+         if e.response is not None and e.response.status_code in {401, 403}:
+             logging.error(f"Hugging Face authentication/authorization failed (Status {e.response.status_code}).")
+             raise gr.Error("Hugging Face authentication failed. Check HF_TOKEN and dataset license acceptance.")
+         else:
+             logging.error(f"HTTP error during download: {e}")
+             raise gr.Error(f"Failed to download file due to an HTTP error: {e}")
+     except Exception as e:
+         logging.error(f"An unexpected error occurred during download for {repo_relative_path}: {e}", exc_info=True)
+         raise gr.Error(f"Download error for {repo_relative_path}: {e}")
+
+     # --- Load and Chunk ---
+     try:
+         text = local_path.read_text(encoding="utf-8", errors="replace")
+         headers_to_split_on = [("#", "H1"), ("##", "H2"), ("###", "H3"), ("####", "H4")]
+         splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)
+         docs = splitter.split_text(text)
+
+         if not docs or (len(docs) == 1 and len(docs[0].page_content) > 5000):
+             logging.warning(f"MarkdownHeaderTextSplitter yielded few/large chunks for {repo_relative_path}, using RecursiveCharacterTextSplitter.")
+             fallback_splitter = RecursiveCharacterTextSplitter(
+                 chunk_size=1000, chunk_overlap=150, separators=["\n\n", "\n", ". ", ", ", " ", ""]
+             )
+             docs = fallback_splitter.create_documents([text])
+
+         if not docs:
+             logging.warning(f"File '{repo_relative_path}' resulted in zero documents after splitting.")
+             return None  # Cannot create index from no documents
+         logging.info(f"Split {repo_relative_path} into {len(docs)} documents.")
+
+     except Exception as e:
+         logging.error(f"Failed to read/split file {local_path}: {e}", exc_info=True)
+         raise gr.Error(f"Error processing content of {repo_relative_path}: {e}")
+
+     # --- Embed and Create Vector Store ---
+     try:
+         embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, api_key=api_key_for_embedding)
+         vectordb = FAISS.from_documents(docs, embeddings)
+         logging.info(f"Created FAISS index for {repo_relative_path}.")
+         return vectordb
+     except Exception as e:
+         # Catch potential OpenAI API errors specifically if possible
+         logging.error(f"Failed during embedding/vector store creation for {repo_relative_path}: {e}", exc_info=True)
+         # Check for common errors based on string matching (less robust but helpful)
+         if "AuthenticationError" in str(e) or "Incorrect API key" in str(e):
+             raise gr.Error(f"OpenAI Authentication Error for {repo_relative_path}. Check your API Key. Details: {e}")
+         elif "RateLimitError" in str(e):
+             raise gr.Error(f"OpenAI Rate Limit Error for {repo_relative_path}. Details: {e}")
+         else:
+             raise gr.Error(f"Embedding/VectorStore Error for {repo_relative_path}: {e}")
+
+
+ def get_or_create_dynamic_index(repo_relative_path: str, user_api_key: str) -> Optional[FAISS]:
+     """Loads a dynamic index from cache or creates+caches it if new."""
+     global cache_manifest  # Allow modification
+
+     if repo_relative_path in cache_manifest:
+         local_faiss_path_str = cache_manifest[repo_relative_path]
+         local_faiss_path = Path(local_faiss_path_str)
+         if local_faiss_path.exists():
+             try:
+                 # Need embeddings object to load; use user's key as they initiated the session
+                 embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, api_key=user_api_key)
+                 index = FAISS.load_local(str(local_faiss_path.parent), embeddings, index_name=local_faiss_path.stem, allow_dangerous_deserialization=True)
+                 logging.info(f"Loaded cached index for {repo_relative_path} from {local_faiss_path_str}")
+                 return index
+             except Exception as e:
+                 logging.error(f"Failed to load cached index {local_faiss_path_str}: {e}. Will try to re-create.", exc_info=True)
+                 # Remove potentially corrupted entry from manifest
+                 del cache_manifest[repo_relative_path]
+                 save_cache_manifest()
+         else:
+             logging.warning(f"Cache manifest points to non-existent file: {local_faiss_path_str}. Removing entry.")
+             del cache_manifest[repo_relative_path]
+             save_cache_manifest()
+
+     # --- If not cached or loading failed, create it ---
+     logging.info(f"Cache miss or load failure for {repo_relative_path}. Processing anew.")
+     if not user_api_key:
+         raise gr.Error(f"Cannot process new file '{repo_relative_path}' without an OpenAI API Key.")
+
+     new_index = download_and_process_file(repo_relative_path, user_api_key)
+
+     if new_index:
+         # Save the newly created index
+         try:
+             sanitized_name = sanitize_path_for_filename(repo_relative_path)
+             save_path = CACHE_DIR / sanitized_name
+             # FAISS save_local saves folder and index_name.faiss/pkl inside it
+             new_index.save_local(folder_path=str(CACHE_DIR), index_name=save_path.stem)
+             full_saved_path = str(CACHE_DIR / (save_path.stem + ".faiss"))  # Path to the actual .faiss file
+
+             # Update manifest
+             cache_manifest[repo_relative_path] = full_saved_path
+             save_cache_manifest()
+             logging.info(f"Saved new index for {repo_relative_path} to {full_saved_path} and updated manifest.")
+             return new_index
+         except Exception as e:
+             logging.error(f"Failed to save new index for {repo_relative_path}: {e}", exc_info=True)
+             # Don't raise here, maybe it works in memory for the session
+             return new_index  # Return in-memory index even if saving failed
+     else:
+         # download_and_process_file failed, error already raised or logged
+         return None
+
+ # -----------------------------------------------------------------------------
+ # Pre-processing Base Knowledge (Run once at startup if needed)
+ # -----------------------------------------------------------------------------
+ def preprocess_base_knowledge():
+     """Creates and saves the base knowledge FAISS index if it doesn't exist by processing files individually and merging."""
+     global base_knowledge_index
+     if BASE_KNOWLEDGE_INDEX_PATH.exists():
+         try:
+             if not BASE_OPENAI_API_KEY:
+                 logging.error("Base OpenAI API Key missing. Cannot load base knowledge index.")
+                 return
+             embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, api_key=BASE_OPENAI_API_KEY)
+             base_knowledge_index = FAISS.load_local(
+                 str(BASE_KNOWLEDGE_INDEX_PATH.parent),
+                 embeddings,
+                 index_name=BASE_KNOWLEDGE_INDEX_PATH.stem,
+                 allow_dangerous_deserialization=True
+             )
+             logging.info(f"Successfully loaded base knowledge index from {BASE_KNOWLEDGE_INDEX_PATH}")
+             return  # Successfully loaded, no need to rebuild
+         except Exception as e:
+             logging.error(f"Failed to load base knowledge index from {BASE_KNOWLEDGE_INDEX_PATH}: {e}. Will attempt to rebuild.", exc_info=True)
+             base_knowledge_index = None
+             # Optionally delete corrupted files:
+             # try:
+             #     if BASE_KNOWLEDGE_INDEX_PATH.exists(): BASE_KNOWLEDGE_INDEX_PATH.unlink()
+             #     pkl_path = BASE_KNOWLEDGE_INDEX_PATH.with_suffix(".pkl")
+             #     if pkl_path.exists(): pkl_path.unlink()
+             # except OSError as rm_err:
+             #     logging.error(f"Failed to delete potentially corrupted index files: {rm_err}")
+
+
+     if base_knowledge_index is None:
+         logging.info("Base knowledge index not found or failed to load. Starting pre-processing...")
+         if not BASE_OPENAI_API_KEY:
+             logging.error("Cannot pre-process base knowledge: BASE_OPENAI_API_KEY is not set.")
+             raise RuntimeError("OpenAI API Key needed for initial base knowledge processing is not configured.")
+         if not HF_TOKEN:
+             logging.error("Cannot pre-process base knowledge: HF_TOKEN is not set.")
+             raise RuntimeError("Hugging Face Token needed for initial base knowledge processing is not configured.")
+
+         individual_indices: List[FAISS] = []  # Store index for each base file
+
+         for file_path in FIXED_FILES:
+             try:
+                 # Process each file individually to get its FAISS index
+                 # This ensures embedding requests are per-file, not one giant batch
+                 index = download_and_process_file(file_path, BASE_OPENAI_API_KEY)
+                 if index:
+                     individual_indices.append(index)
+                     # Note: document count is now per-file in logs from download_and_process_file
+                     logging.info(f"Successfully processed base file: {file_path}")
+                 else:
+                     logging.warning(f"Skipping base file {file_path} due to processing error (returned None index).")
+
+             except Exception as e:
+                 # If download_and_process_file raises an error (e.g., download failed, API key invalid)
+                 logging.error(f"Failed processing base file {file_path}: {e}", exc_info=True)
+                 # Decide whether to stop or continue; let's stop to avoid partial base index
+                 raise RuntimeError(f"Failed to process base file {file_path}. Cannot create complete base knowledge index.") from e
+
+         if not individual_indices:
+             logging.error("No individual indices were successfully created for the base knowledge. Cannot proceed.")
+             raise RuntimeError("Failed to process any base files successfully.")
+
+         try:
+             logging.info(f"Merging {len(individual_indices)} individual indices into the final base knowledge index...")
+             # Start with the first index
+             base_knowledge_index = individual_indices[0]
+             # Merge the rest
+             if len(individual_indices) > 1:
+                 for index_to_merge in individual_indices[1:]:
+                     base_knowledge_index.merge_from(index_to_merge)
+
+             total_vectors = base_knowledge_index.index.ntotal
+             logging.info(f"Final base knowledge index created with {total_vectors} total vectors.")
+
+             # Save the final merged index
+             base_knowledge_index.save_local(folder_path=str(CACHE_DIR), index_name=BASE_KNOWLEDGE_INDEX_PATH.stem)
+             logging.info(f"Successfully saved merged base knowledge index to {BASE_KNOWLEDGE_INDEX_PATH}")
+
+         except Exception as e:
+             logging.error(f"Failed to merge individual indices or save the final base knowledge index: {e}", exc_info=True)
+             # Set base_knowledge_index back to None so app knows it failed
+             base_knowledge_index = None
+             raise RuntimeError("Failed to merge or save the final base knowledge index.") from e
+ # -----------------------------------------------------------------------------
+ # Gradio Chat Function
+ # -----------------------------------------------------------------------------
+
+ GradioChatMessages = List[Dict[str, str]]  # [{'role': 'user', 'content': 'hi'}, ...]
+
+ def chat_llm(
+     user_email: str,
+     user_openai_key: str,
+     dynamic_files_str: str,
+     question: str,
+     history: GradioChatMessages
+ ) -> Tuple[GradioChatMessages, str, str]:  # History, Clear Question Box, Status Update
+     """
+     Gradio callback function. Performs RAG QA for one turn.
+     Uses base knowledge + dynamically loaded/cached files.
+     """
+     status_update = ""
+     if not history: history = []  # Initialize history
+
+     # --- Input Validation ---
+     if not user_email or not is_valid_email(user_email):
+         raise gr.Error("Please enter a valid email address.")
+     if not question or not question.strip():
+         raise gr.Error("Please enter a question.")
+
+     # Parse and validate dynamic file paths
+     dynamic_files = [f.strip() for f in dynamic_files_str.split(',') if f.strip()]
+     if len(dynamic_files) > MAX_DYNAMIC_FILES:
+         raise gr.Error(f"Please select a maximum of {MAX_DYNAMIC_FILES} dynamic files per session.")
+     if dynamic_files and not user_openai_key:
+         raise gr.Error("Please provide your OpenAI API Key to process dynamic files.")
+
+     # Log user interaction
+     logging.info(f"Chat request from: {user_email}, Dynamic files: {dynamic_files}, Question: '{question[:50]}...'")
+
+     # Use the provided key, or fall back to the base key if available (only if no dynamic files)
+     # If dynamic files are present, user_openai_key MUST be used and validated
+     api_key_to_use = user_openai_key if dynamic_files else (user_openai_key or BASE_OPENAI_API_KEY)
+     if not api_key_to_use:
+         raise gr.Error("An OpenAI API Key is required for this operation (either user-provided or pre-configured).")
+
+
+     session_indices: List[FAISS] = []
+     processed_dynamic_files_this_session: List[str] = []
+     newly_cached_files: List[str] = []
+
+     # --- Retriever Setup ---
+     # 1. Add Base Knowledge
+     if base_knowledge_index:
+         session_indices.append(base_knowledge_index)
+         logging.debug("Added base knowledge index to session.")
+     else:
+         logging.error("Base knowledge index is not loaded. Cannot proceed.")
+         raise gr.Error("Base knowledge index is unavailable. Please check logs.")
+
+     # 2. Process Dynamic Files
+     for file_path in dynamic_files:
+         try:
+             was_cached = file_path in cache_manifest
+             dynamic_index = get_or_create_dynamic_index(file_path, api_key_to_use)  # Use the determined API key
+             if dynamic_index:
+                 session_indices.append(dynamic_index)
+                 processed_dynamic_files_this_session.append(file_path)
+                 if not was_cached:  # If it wasn't in the manifest before get_or_create ran
+                     newly_cached_files.append(file_path)
+             # else: Error handled within get_or_create_dynamic_index by raising gr.Error
+
+         except gr.Error as e:
+             # Propagate Gradio errors to UI
+             raise e
+         except Exception as e:
+             logging.error(f"Unexpected error processing dynamic file {file_path}: {e}", exc_info=True)
+             raise gr.Error(f"Failed to process dynamic file {file_path}: {e}")
+
+
+     # --- Combine Indices for Session (if dynamic files were added) ---
+     if len(session_indices) > 1:  # Need to merge if dynamic files were added
+         try:
+             logging.info(f"Merging {len(session_indices)} indices for the session...")
+             # Build a temporary merged index for this session. Merge into the first
+             # *dynamic* index (created or loaded fresh for this request) rather than
+             # into the shared base_knowledge_index, so merge_from() does not mutate
+             # the global base index across sessions.
+             session_master_index = session_indices[1]
+             session_master_index.merge_from(session_indices[0])  # add the base knowledge
+             for index_to_merge in session_indices[2:]:
+                 session_master_index.merge_from(index_to_merge)
+             logging.info(f"Session index created with {session_master_index.index.ntotal} total vectors.")
+             session_retriever = session_master_index.as_retriever(search_kwargs={"k": 5})
+         except Exception as e:
+             logging.error(f"Failed to merge session indices: {e}", exc_info=True)
+             raise gr.Error(f"Error creating session knowledge base: {e}")
+     elif session_indices:  # Only base knowledge was used
+         session_retriever = session_indices[0].as_retriever(search_kwargs={"k": 5})
+     else:
+         # Should have been caught earlier if base_knowledge_index was None
+         raise gr.Error("No knowledge base available for retrieval.")
+
+
+     # --- Setup LLM and RAG Chain ---
+     try:
+         llm = ChatOpenAI(model=LLM_MODEL, temperature=0.1, api_key=api_key_to_use, max_retries=1)
+
+         template = """You are an assistant specializing in 3GPP technical specifications.
+ Answer the following question based *only* on the provided context document snippets from the specified files.
+ The context comes from the base knowledge files and potentially these user-provided files: {dynamic_files_list_str}
+ If the answer is not found in the context, state that you cannot answer based on the provided information. Be concise and accurate.
+
+ Context:
+ {context}
+
+ Question:
+ {question}
+
+ Answer:"""
+         prompt = ChatPromptTemplate.from_template(template)
+
+         # Function to format retrieved documents
+         def format_docs(docs):
+             return "\n\n".join(doc.page_content for doc in docs)
+
+         # RAG Chain
+         rag_chain = (
+             {"context": session_retriever | format_docs,
+              "question": RunnablePassthrough(),
+              "dynamic_files_list_str": lambda x: ", ".join(dynamic_files) or "None"}  # Pass dynamic files for context
+             | prompt
+             | llm
+             | StrOutputParser()
+         )
+
+         logging.info(f"Invoking RAG chain for question: '{question[:50]}...'")
+         answer = rag_chain.invoke(question)
+         logging.info(f"Received answer: '{answer[:100]}...'")
+
+         # Update user data
+         if user_email not in user_data: user_data[user_email] = []
+         updated_files_for_user = set(user_data[user_email]) | set(processed_dynamic_files_this_session)
+         user_data[user_email] = sorted(list(updated_files_for_user))
+         save_user_data()  # Save after successful interaction
+
+         # Prepare status update message
+         if newly_cached_files:
+             status_update = f"Info: The following new files were processed and cached for future use: {', '.join(newly_cached_files)}."
+
+     except Exception as e:
+         logging.error(f"Error during RAG chain execution or user data update: {e}", exc_info=True)
+         # Append error to chat instead of crashing
+         history.append({"role": "user", "content": question})
+         history.append({"role": "assistant", "content": f"An error occurred: {e}"})
+         return history, question, "Error occurred. Check logs."  # Keep question in box
+
+     # --- Update History and Return ---
+     history.append({"role": "user", "content": question})
+     history.append({"role": "assistant", "content": answer})
+
+     return history, "", status_update  # Clear question box, provide status
+
+
+ # -----------------------------------------------------------------------------
+ # Gradio UI Definition
+ # -----------------------------------------------------------------------------
+
+ # --- UI Text Blocks ---
+
+ # Construct the cached files list string separately
+ sorted_keys = sorted(list(cache_manifest.keys()))
+ if sorted_keys:
+     # Format each key as a markdown bullet point with backticks
+     formatted_items = [f"* `{key}`" for key in sorted_keys]
+     # Join them with newlines
+     file_list_str = "\n".join(formatted_items)
+ else:
+     file_list_str = "* None yet."  # Message when no files are cached
+
+ # Now define the info string using the pre-formatted list
+ cached_files_info = f"""
+ **Available Cached Files:**
+ The following dynamically added files have already been processed and cached:
+ {file_list_str}
+ """
+
+ # --- The rest of the UI text blocks (disclaimer_text, base_knowledge_info) remain the same ---
+ disclaimer_text = f"""
+ **Disclaimer & Usage Notes:**
+ * **Research Preview:** This is a demonstration application for research purposes. Accuracy is not guaranteed.
+ * **License:** By using this application, you agree to the terms and license of the underlying dataset (`{DATASET_ID}`). Please review the dataset's license terms on Hugging Face Hub.
+ * **API Keys:** Your OpenAI API key is required to process *new* documents you specify. It is used solely for embedding generation during your session and is not stored persistently by this application.
+ * **Caching:** Processed dynamic files are cached locally (embeddings only) to speed up future sessions.
+ * **Estimated Cost:** Processing *new* files incurs OpenAI API costs (approx. ${ESTIMATED_COST_PER_FILE_CENTS / 100:.2f} per file for `{EMBEDDING_MODEL}`). Using already cached files or only the base knowledge is free within this app.
+ * **Data:** Your email is logged along with the files you process for usage tracking. See `{USER_DATA_PATH.name}`.
+ """
+
+ base_knowledge_info = f"""
+ **Base Knowledge:**
+ The chatbot always has access to the following pre-processed 3GPP specification files:
+ * `{FIXED_FILES[0]}`
+ * `{FIXED_FILES[1]}`
+ * `{FIXED_FILES[2]}`
+ * `{FIXED_FILES[3]}`
+ """
+
+
+ # --- Build UI ---
+ with gr.Blocks(theme=gr.themes.Soft(), title="3GPP TSpec RAG Assistant") as demo:
+     gr.Markdown("# 📄 3GPP TSpec RAG Assistant")
+
+     with gr.Row():
+         # --- Left Column (Chat Interface) ---
+         with gr.Column(scale=7):  # 70% width
+             chatbot = gr.Chatbot(
+                 label="Chat Session",
+                 height=600,
+                 type="messages",
+                 show_copy_button=True,
+             )
+             question_inp = gr.Textbox(
+                 label="Your Question",
+                 placeholder="Ask a question about the selected documents...",
+                 lines=3
+             )
+             status_out = gr.Textbox(label="Status Updates", interactive=False)
+
+         # --- Right Column (Controls & Info) ---
+         with gr.Column(scale=3):  # 30% width
+             gr.Markdown("### Session Configuration")
+             email_inp = gr.Textbox(label="Your Email Address", placeholder="Enter your email...")
+             openai_key_inp = gr.Textbox(
+                 label="Your OpenAI API Key (Required for new files)",
+                 placeholder="Enter your OpenAI API key (sk-...)",
+                 type="password"
+             )
+             dynamic_files_inp = gr.Textbox(
+                 label=f"Dynamic Files (Optional, max {MAX_DYNAMIC_FILES}, comma-separated)",
+                 placeholder="e.g., Rel-17/23_series/23501-h50.md, Rel-18/...",
+                 lines=3
+             )
+             ask_btn = gr.Button("Ask Question", variant="primary")
+
+             with gr.Accordion("Usage Information & Disclaimers", open=False):
+                 gr.Markdown(disclaimer_text)
+             with gr.Accordion("Base Knowledge Files", open=False):
+                 gr.Markdown(base_knowledge_info)
+             with gr.Accordion("Cached Dynamic Files", open=True):
+                 # Use an HTML component to allow dynamic updates if needed later
+                 # For now, just display the initial list
+                 # cached_list_html = gr.HTML(value=f"<ul><li>{ '</li><li>'.join(sorted(list(cache_manifest.keys()))) or 'None' }</li></ul>")
+                 # Simpler Markdown display:
+                 cached_list_md = gr.Markdown(cached_files_info)
+
+
+     # --- Event Handling ---
+     ask_btn.click(
+         fn=chat_llm,
+         inputs=[email_inp, openai_key_inp, dynamic_files_inp, question_inp, chatbot],
+         outputs=[chatbot, question_inp, status_out]  # Update chat, clear question, show status
+     )
+
+     # Example Button (Optional - might be less useful with dynamic files)
+     # gr.Examples(...)
+
+
+ # -----------------------------------------------------------------------------
+ # Application Entry Point & Initial Setup
+ # -----------------------------------------------------------------------------
+ if __name__ == "__main__":
+     print("Starting application setup...")
+     # 1. Load user data and cache manifest
+     print("Loading user data...")
+     load_user_data()
+     print("Loading cache manifest...")
+     load_cache_manifest()
+     print(f"Found {len(cache_manifest)} cached files.")
+
+     # 2. Ensure base knowledge index is ready
+     print("Checking base knowledge index...")
+     try:
+         preprocess_base_knowledge()
+         print("Base knowledge index is ready.")
+     except Exception as e:
+         print(f"\n!!! CRITICAL ERROR during base knowledge setup: {e} !!!")
+         print("The application cannot start without the base knowledge index.")
+         print("Please ensure OPENAI_API_KEY and HF_TOKEN are correctly set in your environment or .env file and that you have accepted the dataset license.")
+         # Exit if base knowledge failed critically
+         import sys
+         sys.exit(1)
+
+     # 3. Launch Gradio App
+     print("Launching Gradio interface...")
+     demo.launch(debug=True)  # debug=True for detailed logs locally
cache_manifest.json ADDED
File without changes
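cache_manifest.json is committed empty; app.py fills it in via save_cache_manifest() as dynamic files are embedded, mapping each repo-relative markdown path to the FAISS file written under cached_embeddings/. A purely illustrative entry (the path is the hypothetical example from the UI placeholder, not a file that has actually been cached, and the stored value is whatever str(CACHE_DIR / ...) resolves to, so it may be an absolute path):

{
    "Rel-17/23_series/23501-h50.md": "cached_embeddings/Rel-17_23_series_23501-h50.md.faiss"
}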
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ gradio>=5.29.0
+ huggingface_hub
+ langchain
+ langchain-openai
+ openai
+ faiss-cpu
+ tiktoken
+ python-dotenv
+ markdown-it-py
+ mdit_plain
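One possible gap: app.py imports FAISS from langchain_community, and recent langchain releases do not always install langchain-community automatically, so that package may need to be added here if it is not pulled in transitively. For a local run, the two secrets that app.py reads via os.getenv can come from the environment or from a .env file picked up by load_dotenv(); a minimal sketch with placeholder values:

OPENAI_API_KEY=sk-...
HF_TOKEN=hf_...

Launching with python app.py then builds (or loads) the cached base knowledge index and starts the Gradio interface.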
user_data.json ADDED
@@ -0,0 +1,3 @@
+ {
+
+ }
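user_data.json starts as an empty object; chat_llm() in app.py adds one key per email address holding the sorted list of dynamic files that user has processed. An illustrative (hypothetical) entry after a session that processed one dynamic file:

{
    "user@example.com": [
        "Rel-17/23_series/23501-h50.md"
    ]
}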