import os
import re
import json
import logging
import hashlib
from pathlib import Path
from typing import List, Tuple, Dict, Optional

import gradio as gr
from dotenv import load_dotenv
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError
from requests.exceptions import HTTPError

from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# -----------------------------------------------------------------------------
# Configuration & Environment Variables
# -----------------------------------------------------------------------------
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
load_dotenv()  # Load .env file for local development
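# Illustrative .env contents for local development (values are placeholders):
#   OPENAI_API_KEY=sk-...
#   HF_TOKEN=hf_...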

# --- API Keys ---
# Base key is for potentially pre-processing fixed files (if needed)
# User key is required for processing *new* dynamic files
BASE_OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") # Used for pre-processing base files if needed
HF_TOKEN = os.getenv("HF_TOKEN")

# --- Constants ---
DATASET_ID = "rasoul-nikbakht/TSpec-LLM"
DATA_SUBDIR = "3GPP-clean"
EMBEDDING_MODEL = "text-embedding-3-small"
LLM_MODEL = "gpt-4o-mini"
MAX_DYNAMIC_FILES = 3
ESTIMATED_COST_PER_FILE_CENTS = 2 # Rough estimate

# --- File Paths ---
SCRIPT_DIR = Path(__file__).parent
CACHE_DIR = SCRIPT_DIR / "cached_embeddings"
BASE_KNOWLEDGE_INDEX_PATH = CACHE_DIR / "base_knowledge.faiss"
USER_DATA_PATH = SCRIPT_DIR / "user_data.json"
CACHE_MANIFEST_PATH = SCRIPT_DIR / "cache_manifest.json"

# Ensure cache directory exists
CACHE_DIR.mkdir(exist_ok=True)

# --- Fixed Base Knowledge Files ---
# Relative paths within the dataset repo (without DATA_SUBDIR)
FIXED_FILES = [
    "Rel-16/38_series/38901-g10.md",
    "Rel-16/38_series/38821-g20.md",
    "Rel-15/36_series/36777-f00_1.md",
    "Rel-15/36_series/36777-f00_2.md",
]
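# At download time each entry is prefixed with DATA_SUBDIR,
# e.g. "Rel-16/38_series/38901-g10.md" -> "3GPP-clean/Rel-16/38_series/38901-g10.md".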

# -----------------------------------------------------------------------------
# Global Variables & In-Memory Stores (Load at startup)
# -----------------------------------------------------------------------------
base_knowledge_index: Optional[FAISS] = None
user_data: Dict[str, List[str]] = {} # {email: [list_of_processed_files]}
cache_manifest: Dict[str, str] = {} # {repo_relative_path: local_faiss_path}

# -----------------------------------------------------------------------------
# Helper Functions
# -----------------------------------------------------------------------------

def sanitize_path_for_filename(repo_path: str) -> str:
    """Creates a safe filename from a repository path."""
    # Remove base dir prefix if present
    if repo_path.startswith(f"{DATA_SUBDIR}/"):
         repo_path = repo_path[len(f"{DATA_SUBDIR}/"):]
    # Replace slashes and invalid chars; use hashing for very long paths if needed
    sanitized = re.sub(r'[\\/*?:"<>|]', '_', repo_path)
    # Optional: Limit length and add hash if too long
    if len(sanitized) > 100:
        hash_suffix = hashlib.md5(repo_path.encode()).hexdigest()[:8]
        sanitized = sanitized[:90] + "_" + hash_suffix
    return sanitized + ".faiss"

def is_valid_email(email: str) -> bool:
    """Basic regex check for email format."""
    # This is a simple check, not foolproof validation
    pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
    return re.match(pattern, email) is not None

def load_user_data():
    """Loads user email and associated file data from JSON."""
    global user_data
    if USER_DATA_PATH.exists():
        try:
            with open(USER_DATA_PATH, 'r') as f:
                user_data = json.load(f)
            logging.info(f"Loaded user data for {len(user_data)} users from {USER_DATA_PATH}")
        except json.JSONDecodeError:
            logging.error(f"Error decoding JSON from {USER_DATA_PATH}. Starting with empty user data.")
            user_data = {}
        except Exception as e:
            logging.error(f"Failed to load user data: {e}", exc_info=True)
            user_data = {}
    else:
        logging.info("User data file not found. Starting fresh.")
        user_data = {}

def save_user_data():
    """Saves user email and associated file data to JSON."""
    try:
        with open(USER_DATA_PATH, 'w') as f:
            json.dump(user_data, f, indent=4)
        # logging.info(f"Saved user data to {USER_DATA_PATH}") # Can be noisy
    except Exception as e:
        logging.error(f"Failed to save user data: {e}", exc_info=True)

def load_cache_manifest():
    """Loads the manifest of locally cached embeddings."""
    global cache_manifest
    if CACHE_MANIFEST_PATH.exists():
        try:
            with open(CACHE_MANIFEST_PATH, 'r') as f:
                cache_manifest = json.load(f)
            logging.info(f"Loaded cache manifest with {len(cache_manifest)} entries from {CACHE_MANIFEST_PATH}")
            # Optional: Verify that the referenced FAISS files actually exist
            # keys_to_remove = [k for k, v in cache_manifest.items() if not Path(v).exists()]
            # if keys_to_remove:
            #     logging.warning(f"Removing {len(keys_to_remove)} stale entries from cache manifest.")
            #     for k in keys_to_remove: del cache_manifest[k]
            #     save_cache_manifest()  # Save cleaned manifest
        except json.JSONDecodeError:
            logging.error(f"Error decoding JSON from {CACHE_MANIFEST_PATH}. Starting with empty manifest.")
            cache_manifest = {}
        except Exception as e:
            logging.error(f"Failed to load cache manifest: {e}", exc_info=True)
            cache_manifest = {}
    else:
        logging.info("Cache manifest file not found. Starting fresh.")
        cache_manifest = {}

def save_cache_manifest():
    """Saves the manifest of locally cached embeddings."""
    try:
        with open(CACHE_MANIFEST_PATH, 'w') as f:
            json.dump(cache_manifest, f, indent=4)
        # logging.info(f"Saved cache manifest to {CACHE_MANIFEST_PATH}") # Can be noisy
    except Exception as e:
        logging.error(f"Failed to save cache manifest: {e}", exc_info=True)

def download_and_process_file(repo_relative_path: str, api_key_for_embedding: str) -> Optional[FAISS]:
    """Downloads, chunks, embeds a single file, returning a FAISS index."""
    if not HF_TOKEN:
         logging.error("HF_TOKEN is missing. Cannot download from gated dataset.")
         # Don't raise gr.Error here, handle return value in caller
         return None
    if not api_key_for_embedding:
        logging.error("OpenAI API Key is missing. Cannot create embeddings.")
        return None

    full_repo_path = f"{DATA_SUBDIR}/{repo_relative_path}"
    logging.info(f"Processing file: {repo_relative_path}")

    # --- Download ---
    try:
        local_path_str = hf_hub_download(
            repo_id=DATASET_ID,
            filename=full_repo_path,
            repo_type="dataset",
            token=HF_TOKEN,
            cache_dir="./hf_cache"
        )
        local_path = Path(local_path_str)
        logging.info(f"Downloaded {repo_relative_path} to: {local_path}")
    except EntryNotFoundError:
        logging.error(f"File not found in repository: {full_repo_path}")
        raise gr.Error(f"File not found in repository: '{repo_relative_path}'. Please check the path.")
    except HTTPError as e:
        if e.response is not None and e.response.status_code in {401, 403}:
            logging.error(f"Hugging Face authentication/authorization failed (Status {e.response.status_code}).")
            raise gr.Error("Hugging Face authentication failed. Check HF_TOKEN and dataset license acceptance.")
        else:
            logging.error(f"HTTP error during download: {e}")
            raise gr.Error(f"Failed to download file due to an HTTP error: {e}")
    except Exception as e:
        logging.error(f"An unexpected error occurred during download for {repo_relative_path}: {e}", exc_info=True)
        raise gr.Error(f"Download error for {repo_relative_path}: {e}")

    # --- Load and Chunk ---
    try:
        text = local_path.read_text(encoding="utf-8", errors="replace")
        headers_to_split_on = [("#", "H1"), ("##", "H2"), ("###", "H3"), ("####", "H4")]
        splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)
        docs = splitter.split_text(text)

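        # Heuristic fallback: if header splitting produced nothing, or a single
        # oversized chunk (> 5000 chars), re-split into fixed-size character chunks.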
        if not docs or (len(docs) == 1 and len(docs[0].page_content) > 5000):
            logging.warning(f"MarkdownHeaderTextSplitter yielded few/large chunks for {repo_relative_path}, using RecursiveCharacterTextSplitter.")
            fallback_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=150, separators=["\n\n", "\n", ". ", ", ", " ", ""]
            )
            docs = fallback_splitter.create_documents([text])

        if not docs:
            logging.warning(f"File '{repo_relative_path}' resulted in zero documents after splitting.")
            return None # Cannot create index from no documents
        logging.info(f"Split {repo_relative_path} into {len(docs)} documents.")

    except Exception as e:
        logging.error(f"Failed to read/split file {local_path}: {e}", exc_info=True)
        raise gr.Error(f"Error processing content of {repo_relative_path}: {e}")

    # --- Embed and Create Vector Store ---
    try:
        embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, api_key=api_key_for_embedding)
        vectordb = FAISS.from_documents(docs, embeddings)
        logging.info(f"Created FAISS index for {repo_relative_path}.")
        return vectordb
    except Exception as e:
        # Catch potential OpenAI API errors specifically if possible
        logging.error(f"Failed during embedding/vector store creation for {repo_relative_path}: {e}", exc_info=True)
        # Check for common errors based on string matching (less robust but helpful)
        if "AuthenticationError" in str(e) or "Incorrect API key" in str(e):
             raise gr.Error(f"OpenAI Authentication Error for {repo_relative_path}. Check your API Key. Details: {e}")
        elif "RateLimitError" in str(e):
             raise gr.Error(f"OpenAI Rate Limit Error for {repo_relative_path}. Details: {e}")
        else:
             raise gr.Error(f"Embedding/VectorStore Error for {repo_relative_path}: {e}")


def get_or_create_dynamic_index(repo_relative_path: str, user_api_key: str) -> Optional[FAISS]:
    """Loads a dynamic index from cache or creates+caches it if new."""
    global cache_manifest # Allow modification
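    # cache_manifest maps a repo-relative path to the cached .faiss file on disk, e.g.
    # {"Rel-17/23_series/23501-h50.md": "<CACHE_DIR>/Rel-17_23_series_23501-h50.md.faiss"}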

    if repo_relative_path in cache_manifest:
        local_faiss_path_str = cache_manifest[repo_relative_path]
        local_faiss_path = Path(local_faiss_path_str)
        if local_faiss_path.exists():
            try:
                # Need embeddings object to load; use user's key as they initiated the session
                embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, api_key=user_api_key)
                index = FAISS.load_local(str(local_faiss_path.parent), embeddings, index_name=local_faiss_path.stem, allow_dangerous_deserialization=True)
                logging.info(f"Loaded cached index for {repo_relative_path} from {local_faiss_path_str}")
                return index
            except Exception as e:
                logging.error(f"Failed to load cached index {local_faiss_path_str}: {e}. Will try to re-create.", exc_info=True)
                # Remove potentially corrupted entry from manifest
                del cache_manifest[repo_relative_path]
                save_cache_manifest()
        else:
            logging.warning(f"Cache manifest points to non-existent file: {local_faiss_path_str}. Removing entry.")
            del cache_manifest[repo_relative_path]
            save_cache_manifest()

    # --- If not cached or loading failed, create it ---
    logging.info(f"Cache miss or load failure for {repo_relative_path}. Processing anew.")
    if not user_api_key:
         raise gr.Error(f"Cannot process new file '{repo_relative_path}' without an OpenAI API Key.")

    new_index = download_and_process_file(repo_relative_path, user_api_key)

    if new_index:
        # Save the newly created index
        try:
            sanitized_name = sanitize_path_for_filename(repo_relative_path)
            save_path = CACHE_DIR / sanitized_name
            # FAISS.save_local() writes <index_name>.faiss and <index_name>.pkl inside folder_path
            new_index.save_local(folder_path=str(CACHE_DIR), index_name=save_path.stem)
            full_saved_path = str(CACHE_DIR / (save_path.stem + ".faiss")) # Path to the actual .faiss file

            # Update manifest
            cache_manifest[repo_relative_path] = full_saved_path
            save_cache_manifest()
            logging.info(f"Saved new index for {repo_relative_path} to {full_saved_path} and updated manifest.")
            return new_index
        except Exception as e:
            logging.error(f"Failed to save new index for {repo_relative_path}: {e}", exc_info=True)
            # Don't raise here, maybe it works in memory for the session
            return new_index # Return in-memory index even if saving failed
    else:
        # download_and_process_file failed, error already raised or logged
        return None

# -----------------------------------------------------------------------------
# Pre-processing Base Knowledge (Run once at startup if needed)
# -----------------------------------------------------------------------------
def preprocess_base_knowledge():
    """Creates and saves the base knowledge FAISS index if it doesn't exist by processing files individually and merging."""
    global base_knowledge_index
    if BASE_KNOWLEDGE_INDEX_PATH.exists():
        try:
            if not BASE_OPENAI_API_KEY:
                 logging.error("Base OpenAI API Key missing. Cannot load base knowledge index.")
                 return
            embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, api_key=BASE_OPENAI_API_KEY)
            base_knowledge_index = FAISS.load_local(
                str(BASE_KNOWLEDGE_INDEX_PATH.parent),
                embeddings,
                index_name=BASE_KNOWLEDGE_INDEX_PATH.stem,
                allow_dangerous_deserialization=True
            )
            logging.info(f"Successfully loaded base knowledge index from {BASE_KNOWLEDGE_INDEX_PATH}")
            return # Successfully loaded, no need to rebuild
        except Exception as e:
            logging.error(f"Failed to load base knowledge index from {BASE_KNOWLEDGE_INDEX_PATH}: {e}. Will attempt to rebuild.", exc_info=True)
            base_knowledge_index = None
            # Optionally delete corrupted files:
            # try:
            #     if BASE_KNOWLEDGE_INDEX_PATH.exists(): BASE_KNOWLEDGE_INDEX_PATH.unlink()
            #     pkl_path = BASE_KNOWLEDGE_INDEX_PATH.with_suffix(".pkl")
            #     if pkl_path.exists(): pkl_path.unlink()
            # except OSError as rm_err:
            #     logging.error(f"Failed to delete potentially corrupted index files: {rm_err}")


    if base_knowledge_index is None:
        logging.info("Base knowledge index not found or failed to load. Starting pre-processing...")
        if not BASE_OPENAI_API_KEY:
            logging.error("Cannot pre-process base knowledge: BASE_OPENAI_API_KEY is not set.")
            raise RuntimeError("OpenAI API Key needed for initial base knowledge processing is not configured.")
        if not HF_TOKEN:
            logging.error("Cannot pre-process base knowledge: HF_TOKEN is not set.")
            raise RuntimeError("Hugging Face Token needed for initial base knowledge processing is not configured.")

        individual_indices : List[FAISS] = [] # Store index for each base file

        for file_path in FIXED_FILES:
            try:
                # Process each file individually to get its FAISS index
                # This ensures embedding requests are per-file, not one giant batch
                index = download_and_process_file(file_path, BASE_OPENAI_API_KEY)
                if index:
                     individual_indices.append(index)
                     # Note: document count is now per-file in logs from download_and_process_file
                     logging.info(f"Successfully processed base file: {file_path}")
                else:
                     logging.warning(f"Skipping base file {file_path} due to processing error (returned None index).")

            except Exception as e:
                # If download_and_process_file raises an error (e.g., download failed, API key invalid)
                logging.error(f"Failed processing base file {file_path}: {e}", exc_info=True)
                # Decide whether to stop or continue; let's stop to avoid partial base index
                raise RuntimeError(f"Failed to process base file {file_path}. Cannot create complete base knowledge index.") from e

        if not individual_indices:
            logging.error("No individual indices were successfully created for the base knowledge. Cannot proceed.")
            raise RuntimeError("Failed to process any base files successfully.")

        try:
            logging.info(f"Merging {len(individual_indices)} individual indices into the final base knowledge index...")
            # Start with the first index
            base_knowledge_index = individual_indices[0]
            # Merge the rest
            if len(individual_indices) > 1:
                for index_to_merge in individual_indices[1:]:
                    base_knowledge_index.merge_from(index_to_merge)

            total_vectors = base_knowledge_index.index.ntotal
            logging.info(f"Final base knowledge index created with {total_vectors} total vectors.")

            # Save the final merged index
            base_knowledge_index.save_local(folder_path=str(CACHE_DIR), index_name=BASE_KNOWLEDGE_INDEX_PATH.stem)
            logging.info(f"Successfully saved merged base knowledge index to {BASE_KNOWLEDGE_INDEX_PATH}")

        except Exception as e:
            logging.error(f"Failed to merge individual indices or save the final base knowledge index: {e}", exc_info=True)
            # Set base_knowledge_index back to None so app knows it failed
            base_knowledge_index = None
            raise RuntimeError("Failed to merge or save the final base knowledge index.") from e
# -----------------------------------------------------------------------------
# Gradio Chat Function
# -----------------------------------------------------------------------------

GradioChatMessages = List[Dict[str, str]] # [{'role': 'user', 'content': 'hi'}, ...]

def chat_llm(
    user_email: str,
    user_openai_key: str,
    dynamic_files_str: str,
    question: str,
    history: GradioChatMessages
) -> Tuple[GradioChatMessages, str, str]: # History, Clear Question Box, Status Update
    """
    Gradio callback function. Performs RAG QA for one turn.
    Uses base knowledge + dynamically loaded/cached files.
    """
    status_update = ""
    if not history: history = [] # Initialize history

    # --- Input Validation ---
    if not user_email or not is_valid_email(user_email):
        raise gr.Error("Please enter a valid email address.")
    if not question or not question.strip():
        raise gr.Error("Please enter a question.")

    # Parse and validate dynamic file paths
    dynamic_files = [f.strip() for f in dynamic_files_str.split(',') if f.strip()]
    if len(dynamic_files) > MAX_DYNAMIC_FILES:
        raise gr.Error(f"Please select a maximum of {MAX_DYNAMIC_FILES} dynamic files per session.")
    if dynamic_files and not user_openai_key:
        raise gr.Error("Please provide your OpenAI API Key to process dynamic files.")

    # Log user interaction
    logging.info(f"Chat request from: {user_email}, Dynamic files: {dynamic_files}, Question: '{question[:50]}...'")

    # Use provided key or fallback to base key if available (only if no dynamic files)
    # If dynamic files are present, user_openai_key MUST be used and validated
    api_key_to_use = user_openai_key if dynamic_files else (user_openai_key or BASE_OPENAI_API_KEY)
    if not api_key_to_use:
         raise gr.Error("An OpenAI API Key is required for this operation (either user-provided or pre-configured).")


    session_indices : List[FAISS] = []
    processed_dynamic_files_this_session : List[str] = []
    newly_cached_files: List[str] = []

    # --- Retriever Setup ---
    # 1. Add Base Knowledge
    if base_knowledge_index:
        session_indices.append(base_knowledge_index)
        logging.debug("Added base knowledge index to session.")
    else:
        logging.error("Base knowledge index is not loaded. Cannot proceed.")
        raise gr.Error("Base knowledge index is unavailable. Please check logs.")

    # 2. Process Dynamic Files
    for file_path in dynamic_files:
        try:
            was_cached = file_path in cache_manifest
            dynamic_index = get_or_create_dynamic_index(file_path, api_key_to_use) # Use the determined API key
            if dynamic_index:
                session_indices.append(dynamic_index)
                processed_dynamic_files_this_session.append(file_path)
                if not was_cached: # If it wasn't in the manifest before get_or_create ran
                    newly_cached_files.append(file_path)
            # else: Error handled within get_or_create_dynamic_index by raising gr.Error

        except gr.Error as e:
            # Propagate Gradio errors to UI
             raise e
        except Exception as e:
             logging.error(f"Unexpected error processing dynamic file {file_path}: {e}", exc_info=True)
             raise gr.Error(f"Failed to process dynamic file {file_path}: {e}")


    # --- Combine Indices for Session (if dynamic files were added) ---
    if len(session_indices) > 1 : # Need to merge if dynamic files were added
        try:
            logging.info(f"Merging {len(session_indices)} indices for the session...")
            # Create a temporary merged index for this session.
            # Start from a copy of the first index (the base knowledge) so that
            # merge_from() below does not mutate the shared base_knowledge_index in place.
            session_master_index = FAISS.deserialize_from_bytes(
                session_indices[0].serialize_to_bytes(),
                session_indices[0].embeddings,
                allow_dangerous_deserialization=True,
            )
            # Merge the dynamic-file indices into the session copy
            for index_to_merge in session_indices[1:]:
                session_master_index.merge_from(index_to_merge)
            logging.info(f"Session index created with {session_master_index.index.ntotal} total vectors.")
            session_retriever = session_master_index.as_retriever(search_kwargs={"k": 5})
        except Exception as e:
            logging.error(f"Failed to merge session indices: {e}", exc_info=True)
            raise gr.Error(f"Error creating session knowledge base: {e}")
    elif session_indices: # Only base knowledge was used
         session_retriever = session_indices[0].as_retriever(search_kwargs={"k": 5})
    else:
         # Should have been caught earlier if base_knowledge_index was None
         raise gr.Error("No knowledge base available for retrieval.")


    # --- Setup LLM and RAG Chain ---
    try:
        llm = ChatOpenAI(model=LLM_MODEL, temperature=0.1, api_key=api_key_to_use, max_retries=1)

        template = """You are an assistant specializing in 3GPP technical specifications.
Answer the following question based *only* on the provided context document snippets from the specified files.
The context comes from the base knowledge files and potentially these user-provided files: {dynamic_files_list_str}
If the answer is not found in the context, state that you cannot answer based on the provided information. Be concise and accurate.

Context:
{context}

Question:
{question}

Answer:"""
        prompt = ChatPromptTemplate.from_template(template)

        # Function to format retrieved documents
        def format_docs(docs):
            return "\n\n".join(doc.page_content for doc in docs)

        # RAG Chain
        rag_chain = (
            {"context": session_retriever | format_docs,
             "question": RunnablePassthrough(),
             "dynamic_files_list_str" : lambda x: ", ".join(dynamic_files) or "None"} # Pass dynamic files for context
            | prompt
            | llm
            | StrOutputParser()
        )
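        # The retriever's documents are concatenated into {context}, the raw question
        # fills {question}, and the dynamic-file list is stitched into the prompt header.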

        logging.info(f"Invoking RAG chain for question: '{question[:50]}...'")
        answer = rag_chain.invoke(question)
        logging.info(f"Received answer: '{answer[:100]}...'")

        # Update user data
        if user_email not in user_data: user_data[user_email] = []
        updated_files_for_user = set(user_data[user_email]) | set(processed_dynamic_files_this_session)
        user_data[user_email] = sorted(list(updated_files_for_user))
        save_user_data() # Save after successful interaction

        # Prepare status update message
        if newly_cached_files:
            status_update = f"Info: The following new files were processed and cached for future use: {', '.join(newly_cached_files)}."

    except Exception as e:
        logging.error(f"Error during RAG chain execution or user data update: {e}", exc_info=True)
        # Append error to chat instead of crashing
        history.append({"role": "user", "content": question})
        history.append({"role": "assistant", "content": f"An error occurred: {e}"})
        return history, question, "Error occurred. Check logs." # Keep question in box

    # --- Update History and Return ---
    history.append({"role": "user", "content": question})
    history.append({"role": "assistant", "content": answer})

    return history, "", status_update # Clear question box, provide status


# -----------------------------------------------------------------------------
# Gradio UI Definition
# -----------------------------------------------------------------------------

# --- UI Text Blocks ---

# Load persisted state before building the UI text below. This module-level code
# runs before the __main__ block, so the manifest must already be populated here,
# otherwise the "Available Cached Files" list always shows "None yet."
load_user_data()
load_cache_manifest()

# Construct the cached files list string separately
sorted_keys = sorted(cache_manifest.keys())
if sorted_keys:
    # Format each key as a markdown bullet point with backticks
    formatted_items = [f"*   `{key}`" for key in sorted_keys]
    # Join them with newlines
    file_list_str = "\n".join(formatted_items)
else:
    file_list_str = "*   None yet." # Message when no files are cached

# Now define the info string using the pre-formatted list
cached_files_info = f"""
**Available Cached Files:**
The following dynamically added files have already been processed and cached:
{file_list_str}
"""

# --- The rest of the UI text blocks (disclaimer_text, base_knowledge_info) remain the same ---
disclaimer_text = f"""
**Disclaimer & Usage Notes:**
*   **Research Preview:** This is a demonstration application for research purposes. Accuracy is not guaranteed.
*   **License:** By using this application, you agree to the terms and license of the underlying dataset (`{DATASET_ID}`). Please review the dataset's license terms on Hugging Face Hub.
*   **API Keys:** Your OpenAI API key is required to process *new* documents you specify. It is used only during your session (for embedding new documents and for generating answers) and is not stored persistently by this application.
*   **Caching:** Processed dynamic files are cached locally (embeddings only) to speed up future sessions.
*   **Estimated Cost:** Processing *new* files incurs OpenAI API costs (roughly ${ESTIMATED_COST_PER_FILE_CENTS / 100:.2f} per file for `{EMBEDDING_MODEL}`). Already cached files and the base knowledge incur no new embedding cost, although each question still makes one `{LLM_MODEL}` call.
*   **Data:** Your email is logged along with the files you process for usage tracking. See `{USER_DATA_PATH.name}`.
"""

base_knowledge_info = f"""
**Base Knowledge:**
The chatbot always has access to the following pre-processed 3GPP specification files:
*   `{FIXED_FILES[0]}`
*   `{FIXED_FILES[1]}`
*   `{FIXED_FILES[2]}`
*   `{FIXED_FILES[3]}`
"""


# --- Build UI ---
with gr.Blocks(theme=gr.themes.Soft(), title="3GPP TSpec RAG Assistant") as demo:
    gr.Markdown("# πŸ“„ 3GPP TSpec RAG Assistant")

    with gr.Row():
        # --- Left Column (Chat Interface) ---
        with gr.Column(scale=7): # 70% width
            chatbot = gr.Chatbot(
                label="Chat Session",
                height=600,
                type="messages",
                show_copy_button=True,
            )
            question_inp = gr.Textbox(
                label="Your Question",
                placeholder="Ask a question about the selected documents...",
                lines=3
            )
            status_out = gr.Textbox(label="Status Updates", interactive=False)

        # --- Right Column (Controls & Info) ---
        with gr.Column(scale=3): # 30% width
            gr.Markdown("### Session Configuration")
            email_inp = gr.Textbox(label="Your Email Address", placeholder="Enter your email...")
            openai_key_inp = gr.Textbox(
                label="Your OpenAI API Key (Required for new files)",
                placeholder="Enter your OpenAI API key (sk-...)",
                type="password"
            )
            dynamic_files_inp = gr.Textbox(
                label=f"Dynamic Files (Optional, max {MAX_DYNAMIC_FILES}, comma-separated)",
                placeholder="e.g., Rel-17/23_series/23501-h50.md, Rel-18/...",
                lines=3
            )
            ask_btn = gr.Button("Ask Question", variant="primary")

            with gr.Accordion("Usage Information & Disclaimers", open=False):
                 gr.Markdown(disclaimer_text)
            with gr.Accordion("Base Knowledge Files", open=False):
                 gr.Markdown(base_knowledge_info)
            with gr.Accordion("Cached Dynamic Files", open=True):
                 # Use an HTML component to allow dynamic updates if needed later
                 # For now, just display the initial list
                 # cached_list_html = gr.HTML(value=f"<ul><li>{ '</li><li>'.join(sorted(list(cache_manifest.keys()))) or 'None' }</li></ul>")
                 # Simpler Markdown display:
                 cached_list_md = gr.Markdown(cached_files_info)


    # --- Event Handling ---
    ask_btn.click(
        fn=chat_llm,
        inputs=[email_inp, openai_key_inp, dynamic_files_inp, question_inp, chatbot],
        outputs=[chatbot, question_inp, status_out] # Update chat, clear question, show status
    )

    # Example Button (Optional - might be less useful with dynamic files)
    # gr.Examples(...)


# -----------------------------------------------------------------------------
# Application Entry Point & Initial Setup
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    print("Starting application setup...")
    # 1. User data and cache manifest were loaded at import time (before the UI text was built)
    print(f"Found {len(cache_manifest)} cached files.")

    # 2. Ensure base knowledge index is ready
    print("Checking base knowledge index...")
    try:
        preprocess_base_knowledge()
        print("Base knowledge index is ready.")
    except Exception as e:
         print(f"\n!!! CRITICAL ERROR during base knowledge setup: {e} !!!")
         print("The application cannot start without the base knowledge index.")
         print("Please ensure BASE_OPENAI_API_KEY and HF_TOKEN are correctly set in your environment or .env file and you have accepted the dataset license.")
         # Exit if base knowledge failed critically
         import sys
         sys.exit(1)

    # 3. Launch Gradio App
    print("Launching Gradio interface...")
    demo.launch(debug=True, mcp_server=True) # debug=True for detailed logs locally